1 /* 2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 1982, 1986, 1988, 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by the University of 17 * California, Berkeley and its contributors. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 35 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $ 36 * $DragonFly: src/sys/kern/uipc_sockbuf.c,v 1.2 2007/04/22 04:08:59 dillon Exp $ 37 */ 38 39 #include "opt_param.h" 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/domain.h> 43 #include <sys/file.h> /* for maxfiles */ 44 #include <sys/kernel.h> 45 #include <sys/proc.h> 46 #include <sys/malloc.h> 47 #include <sys/mbuf.h> 48 #include <sys/protosw.h> 49 #include <sys/resourcevar.h> 50 #include <sys/stat.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 54 #include <sys/thread2.h> 55 #include <sys/msgport2.h> 56 57 /* 58 * Routines to add and remove data from an mbuf queue. 59 * 60 * The routines sbappend() or sbappendrecord() are normally called to 61 * append new mbufs to a socket buffer. sbappendrecord() differs from 62 * sbappend() in that data supplied is treated as the beginning of a new 63 * record. sbappend() only begins a new record if the last mbuf in the 64 * sockbuf is marked M_EOR. 65 * 66 * To place a sender's address, optional access rights, and data in a 67 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be 68 * used. These functions also begin a new record. 69 * 70 * Reliable protocols may use the socket send buffer to hold data 71 * awaiting acknowledgement. Data is normally copied from a socket 72 * send buffer in a protocol with m_copy for output to a peer, 73 * and then removing the data from the socket buffer with sbdrop() 74 * or sbdroprecord() when the data is acknowledged by the peer. 75 */ 76 77 /* 78 * Append mbuf chain m to the last record in the socket buffer sb. 79 * The additional space associated the mbuf chain is recorded in sb. 80 * Empty mbufs are discarded and mbufs are compacted where possible. 81 * 82 * If M_EOR is set in the first or last mbuf of the last record, the 83 * mbuf chain is appended as a new record. M_EOR is usually just set 84 * in the last mbuf of the last record's mbuf chain (see sbcompress()), 85 * but this may be changed in the future since there is no real need 86 * to propogate the flag any more. 87 */ 88 void 89 sbappend(struct sockbuf *sb, struct mbuf *m) 90 { 91 struct mbuf *n; 92 93 if (m) { 94 n = sb->sb_lastrecord; 95 if (n) { 96 if (n->m_flags & M_EOR) { 97 sbappendrecord(sb, m); 98 return; 99 } 100 } 101 n = sb->sb_lastmbuf; 102 if (n) { 103 if (n->m_flags & M_EOR) { 104 sbappendrecord(sb, m); 105 return; 106 } 107 } 108 sbcompress(sb, m, n); 109 } 110 } 111 112 /* 113 * sbappendstream() is an optimized form of sbappend() for protocols 114 * such as TCP that only have one record in the socket buffer, are 115 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses 116 * sbappendstream() must use sbappendstream() exclusively. 117 */ 118 void 119 sbappendstream(struct sockbuf *sb, struct mbuf *m) 120 { 121 KKASSERT(m->m_nextpkt == NULL); 122 sbcompress(sb, m, sb->sb_lastmbuf); 123 } 124 125 #ifdef SOCKBUF_DEBUG 126 127 void 128 _sbcheck(struct sockbuf *sb) 129 { 130 struct mbuf *m; 131 struct mbuf *n = NULL; 132 u_long len = 0, mbcnt = 0; 133 134 for (m = sb->sb_mb; m; m = n) { 135 n = m->m_nextpkt; 136 if (n == NULL && sb->sb_lastrecord != m) { 137 kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m); 138 panic("sbcheck1"); 139 140 } 141 for (; m; m = m->m_next) { 142 len += m->m_len; 143 mbcnt += MSIZE; 144 if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ 145 mbcnt += m->m_ext.ext_size; 146 if (n == NULL && m->m_next == NULL) { 147 if (sb->sb_lastmbuf != m) { 148 kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m); 149 panic("sbcheck2"); 150 } 151 } 152 } 153 } 154 if (sb->sb_mb == NULL) { 155 if (sb->sb_lastrecord != NULL) { 156 kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n", 157 sb, sb->sb_lastrecord); 158 panic("sbcheck3"); 159 } 160 if (sb->sb_lastmbuf != NULL) { 161 kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n", 162 sb, sb->sb_lastmbuf); 163 panic("sbcheck4"); 164 } 165 } 166 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 167 kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n", 168 sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt); 169 panic("sbcheck5"); 170 } 171 } 172 173 #endif 174 175 /* 176 * Same as sbappend(), except the mbuf chain begins a new record. 177 */ 178 void 179 sbappendrecord(struct sockbuf *sb, struct mbuf *m0) 180 { 181 struct mbuf *firstmbuf; 182 struct mbuf *secondmbuf; 183 184 if (m0 == NULL) 185 return; 186 187 sbcheck(sb); 188 189 /* 190 * Break the first mbuf off from the rest of the mbuf chain. 191 */ 192 firstmbuf = m0; 193 secondmbuf = m0->m_next; 194 m0->m_next = NULL; 195 196 /* 197 * Insert the first mbuf of the m0 mbuf chain as the last record of 198 * the sockbuf. Note this permits zero length records! Keep the 199 * sockbuf state consistent. 200 */ 201 if (sb->sb_mb == NULL) 202 sb->sb_mb = firstmbuf; 203 else 204 sb->sb_lastrecord->m_nextpkt = firstmbuf; 205 sb->sb_lastrecord = firstmbuf; /* update hint for new last record */ 206 sb->sb_lastmbuf = firstmbuf; /* update hint for new last mbuf */ 207 208 /* 209 * propagate the EOR flag so sbcompress() can pick it up 210 */ 211 if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) { 212 firstmbuf->m_flags &= ~M_EOR; 213 secondmbuf->m_flags |= M_EOR; 214 } 215 216 /* 217 * The succeeding call to sbcompress() omits accounting for 218 * the first mbuf, so do it here. 219 */ 220 sballoc(sb, firstmbuf); 221 222 /* Compact the rest of the mbuf chain in after the first mbuf. */ 223 sbcompress(sb, secondmbuf, firstmbuf); 224 } 225 226 /* 227 * Append address and data, and optionally, control (ancillary) data 228 * to the receive queue of a socket. If present, 229 * m0 must include a packet header with total length. 230 * Returns 0 if insufficient mbufs. 231 */ 232 int 233 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, 234 struct mbuf *control) 235 { 236 struct mbuf *m, *n; 237 int eor; 238 239 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 240 panic("sbappendaddr"); 241 sbcheck(sb); 242 243 for (n = control; n; n = n->m_next) { 244 if (n->m_next == NULL) /* keep pointer to last control buf */ 245 break; 246 } 247 if (asa->sa_len > MLEN) 248 return (0); 249 MGET(m, MB_DONTWAIT, MT_SONAME); 250 if (m == NULL) 251 return (0); 252 KKASSERT(m->m_nextpkt == NULL); 253 m->m_len = asa->sa_len; 254 bcopy(asa, mtod(m, caddr_t), asa->sa_len); 255 if (n) 256 n->m_next = m0; /* concatenate data to control */ 257 else 258 control = m0; 259 m->m_next = control; 260 for (n = m; n; n = n->m_next) 261 sballoc(sb, n); 262 263 if (sb->sb_mb == NULL) 264 sb->sb_mb = m; 265 else 266 sb->sb_lastrecord->m_nextpkt = m; 267 sb->sb_lastrecord = m; 268 269 /* 270 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf 271 * so sbappend() can find it. 272 */ 273 eor = m->m_flags; 274 while (m->m_next) { 275 m->m_flags &= ~M_EOR; 276 m = m->m_next; 277 eor |= m->m_flags; 278 } 279 m->m_flags |= eor & M_EOR; 280 sb->sb_lastmbuf = m; 281 282 return (1); 283 } 284 285 /* 286 * Append control information followed by data. Both the control and data 287 * must be non-null. 288 */ 289 int 290 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) 291 { 292 struct mbuf *n; 293 u_int length, cmbcnt, m0mbcnt; 294 int eor; 295 296 KASSERT(control != NULL, ("sbappendcontrol")); 297 KKASSERT(control->m_nextpkt == NULL); 298 sbcheck(sb); 299 300 length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt); 301 302 KKASSERT(m0 != NULL); 303 304 n->m_next = m0; /* concatenate data to control */ 305 306 if (sb->sb_mb == NULL) 307 sb->sb_mb = control; 308 else 309 sb->sb_lastrecord->m_nextpkt = control; 310 sb->sb_lastrecord = control; 311 312 /* 313 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf 314 * so sbappend() can find it. 315 */ 316 eor = m0->m_flags; 317 while (m0->m_next) { 318 m0->m_flags &= ~M_EOR; 319 m0 = m0->m_next; 320 eor |= m0->m_flags; 321 } 322 m0->m_flags |= eor & M_EOR; 323 sb->sb_lastmbuf = m0; 324 325 sb->sb_cc += length; 326 sb->sb_mbcnt += cmbcnt + m0mbcnt; 327 328 return (1); 329 } 330 331 /* 332 * Compress mbuf chain m into the socket buffer sb following mbuf tailm. 333 * If tailm is null, the buffer is presumed empty. Also, as a side-effect, 334 * increment the sockbuf counts for each mbuf in the chain. 335 */ 336 void 337 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm) 338 { 339 int eor = 0; 340 struct mbuf *free_chain = NULL; 341 342 sbcheck(sb); 343 while (m) { 344 struct mbuf *o; 345 346 eor |= m->m_flags & M_EOR; 347 /* 348 * Disregard empty mbufs as long as we don't encounter 349 * an end-of-record or there is a trailing mbuf of 350 * the same type to propagate the EOR flag to. 351 * 352 * Defer the m_free() call because it can block and break 353 * the atomicy of the sockbuf. 354 */ 355 if (m->m_len == 0 && 356 (eor == 0 || 357 (((o = m->m_next) || (o = tailm)) && 358 o->m_type == m->m_type))) { 359 o = m->m_next; 360 m->m_next = free_chain; 361 free_chain = m; 362 m = o; 363 continue; 364 } 365 366 /* See if we can coalesce with preceding mbuf. */ 367 if (tailm && !(tailm->m_flags & M_EOR) && M_WRITABLE(tailm) && 368 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ 369 m->m_len <= M_TRAILINGSPACE(tailm) && 370 tailm->m_type == m->m_type) { 371 bcopy(mtod(m, caddr_t), 372 mtod(tailm, caddr_t) + tailm->m_len, 373 (unsigned)m->m_len); 374 tailm->m_len += m->m_len; 375 sb->sb_cc += m->m_len; /* update sb counter */ 376 o = m->m_next; 377 m->m_next = free_chain; 378 free_chain = m; 379 m = o; 380 continue; 381 } 382 383 /* Insert whole mbuf. */ 384 if (tailm == NULL) { 385 KASSERT(sb->sb_mb == NULL, 386 ("sbcompress: sb_mb not NULL")); 387 sb->sb_mb = m; /* only mbuf in sockbuf */ 388 sb->sb_lastrecord = m; /* new last record */ 389 } else { 390 tailm->m_next = m; /* tack m on following tailm */ 391 } 392 sb->sb_lastmbuf = m; /* update last mbuf hint */ 393 394 tailm = m; /* just inserted mbuf becomes the new tail */ 395 m = m->m_next; /* advance to next mbuf */ 396 tailm->m_next = NULL; /* split inserted mbuf off from chain */ 397 398 /* update sb counters for just added mbuf */ 399 sballoc(sb, tailm); 400 401 /* clear EOR on intermediate mbufs */ 402 tailm->m_flags &= ~M_EOR; 403 } 404 405 /* 406 * Propogate EOR to the last mbuf 407 */ 408 if (eor) { 409 if (tailm) 410 tailm->m_flags |= eor; 411 else 412 kprintf("semi-panic: sbcompress"); 413 } 414 415 /* 416 * Clean up any defered frees. 417 */ 418 while (free_chain) 419 free_chain = m_free(free_chain); 420 421 sbcheck(sb); 422 } 423 424 /* 425 * Free all mbufs in a sockbuf. 426 * Check that all resources are reclaimed. 427 */ 428 void 429 sbflush(struct sockbuf *sb) 430 { 431 while (sb->sb_mbcnt) { 432 /* 433 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: 434 * we would loop forever. Panic instead. 435 */ 436 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) 437 break; 438 sbdrop(sb, (int)sb->sb_cc); 439 } 440 KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf), 441 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p", 442 sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf)); 443 } 444 445 /* 446 * Drop data from (the front of) a sockbuf. If the current record is 447 * exhausted this routine will move onto the next one and continue dropping 448 * data. 449 */ 450 void 451 sbdrop(struct sockbuf *sb, int len) 452 { 453 struct mbuf *m; 454 struct mbuf *free_chain = NULL; 455 456 sbcheck(sb); 457 crit_enter(); 458 459 m = sb->sb_mb; 460 while (m && len > 0) { 461 if (m->m_len > len) { 462 m->m_len -= len; 463 m->m_data += len; 464 sb->sb_cc -= len; 465 break; 466 } 467 len -= m->m_len; 468 m = sbunlinkmbuf(sb, m, &free_chain); 469 if (m == NULL && len) 470 m = sb->sb_mb; 471 } 472 473 /* 474 * Remove any trailing 0-length mbufs in the current record. If 475 * the last record for which data was removed is now empty, m will be 476 * NULL. 477 */ 478 while (m && m->m_len == 0) { 479 m = sbunlinkmbuf(sb, m, &free_chain); 480 } 481 crit_exit(); 482 if (free_chain) 483 m_freem(free_chain); 484 sbcheck(sb); 485 } 486 487 /* 488 * Drop a record off the front of a sockbuf and move the next record 489 * to the front. 490 * 491 * Must be called while holding a critical section. 492 */ 493 void 494 sbdroprecord(struct sockbuf *sb) 495 { 496 struct mbuf *m; 497 struct mbuf *n; 498 499 sbcheck(sb); 500 m = sb->sb_mb; 501 if (m) { 502 if ((sb->sb_mb = m->m_nextpkt) == NULL) { 503 sb->sb_lastrecord = NULL; 504 sb->sb_lastmbuf = NULL; 505 } 506 m->m_nextpkt = NULL; 507 for (n = m; n; n = n->m_next) 508 sbfree(sb, n); 509 m_freem(m); 510 sbcheck(sb); 511 } 512 } 513 514 /* 515 * Drop the first mbuf off the sockbuf and move the next mbuf to the front. 516 * Currently only the head mbuf of the sockbuf may be dropped this way. 517 * 518 * The next mbuf in the same record as the mbuf being removed is returned 519 * or NULL if the record is exhausted. Note that other records may remain 520 * in the sockbuf when NULL is returned. 521 * 522 * Must be called while holding a critical section. 523 */ 524 struct mbuf * 525 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain) 526 { 527 struct mbuf *n; 528 529 KKASSERT(sb->sb_mb == m); 530 sbfree(sb, m); 531 n = m->m_next; 532 if (n) { 533 sb->sb_mb = n; 534 if (sb->sb_lastrecord == m) 535 sb->sb_lastrecord = n; 536 KKASSERT(sb->sb_lastmbuf != m); 537 n->m_nextpkt = m->m_nextpkt; 538 } else { 539 sb->sb_mb = m->m_nextpkt; 540 if (sb->sb_lastrecord == m) { 541 KKASSERT(sb->sb_mb == NULL); 542 sb->sb_lastrecord = NULL; 543 } 544 if (sb->sb_mb == NULL) 545 sb->sb_lastmbuf = NULL; 546 } 547 m->m_nextpkt = NULL; 548 if (free_chain) { 549 m->m_next = *free_chain; 550 *free_chain = m; 551 } else { 552 m->m_next = NULL; 553 } 554 return(n); 555 } 556 557 /* 558 * Create a "control" mbuf containing the specified data 559 * with the specified type for presentation on a socket buffer. 560 */ 561 struct mbuf * 562 sbcreatecontrol(caddr_t p, int size, int type, int level) 563 { 564 struct cmsghdr *cp; 565 struct mbuf *m; 566 567 if (CMSG_SPACE((u_int)size) > MCLBYTES) 568 return (NULL); 569 m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL); 570 if (m == NULL) 571 return (NULL); 572 m->m_len = CMSG_SPACE(size); 573 cp = mtod(m, struct cmsghdr *); 574 if (p != NULL) 575 memcpy(CMSG_DATA(cp), p, size); 576 cp->cmsg_len = CMSG_LEN(size); 577 cp->cmsg_level = level; 578 cp->cmsg_type = type; 579 return (m); 580 } 581 582