1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. 5 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 6 * 7 * This code is derived from software contributed to The DragonFly Project 8 * by Jeffrey M. Hsu. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 /* 37 * Copyright (c) 1982, 1986, 1988, 1991, 1993 38 * The Regents of the University of California. All rights reserved. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. All advertising materials mentioning features or use of this software 49 * must display the following acknowledgement: 50 * This product includes software developed by the University of 51 * California, Berkeley and its contributors. 52 * 4. Neither the name of the University nor the names of its contributors 53 * may be used to endorse or promote products derived from this software 54 * without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 59 * ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
 */

#include "opt_param.h"
#include "opt_mbuf_stress_test.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/objcache.h>
#include <sys/tree.h>
#include <sys/protosw.h>
#include <sys/uio.h>
#include <sys/thread.h>
#include <sys/globaldata.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>

#include <machine/atomic.h>
#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#ifdef INVARIANTS
#include <machine/cpu.h>
#endif

/*
 * mbuf cluster meta-data
 */
struct mbcluster {
	int32_t	mcl_refs;
	void	*mcl_data;
};

/*
 * mbuf tracking for debugging purposes
 */
#ifdef MBUF_DEBUG

static MALLOC_DEFINE(M_MTRACK, "mtrack", "mtrack");

struct mbtrack;
RB_HEAD(mbuf_rb_tree, mbtrack);
RB_PROTOTYPE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *);

struct mbtrack {
	RB_ENTRY(mbtrack) rb_node;
	int	trackid;
	struct mbuf *m;
};

static int
mbtrack_cmp(struct mbtrack *mb1, struct mbtrack *mb2)
{
	if (mb1->m < mb2->m)
		return(-1);
	if (mb1->m > mb2->m)
		return(1);
	return(0);
}

RB_GENERATE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *, m);

struct mbuf_rb_tree	mbuf_track_root;
static struct spinlock	mbuf_track_spin = SPINLOCK_INITIALIZER(mbuf_track_spin);

static void
mbuftrack(struct mbuf *m)
{
	struct mbtrack *mbt;

	mbt = kmalloc(sizeof(*mbt), M_MTRACK, M_INTWAIT|M_ZERO);
	spin_lock(&mbuf_track_spin);
	mbt->m = m;
	if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt)) {
		spin_unlock(&mbuf_track_spin);
		panic("mbuftrack: mbuf %p already being tracked", m);
	}
	spin_unlock(&mbuf_track_spin);
}

static void
mbufuntrack(struct mbuf *m)
{
	struct mbtrack *mbt;

	spin_lock(&mbuf_track_spin);
	mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
	if (mbt == NULL) {
		spin_unlock(&mbuf_track_spin);
		panic("mbufuntrack: mbuf %p was not tracked", m);
	} else {
		mbuf_rb_tree_RB_REMOVE(&mbuf_track_root, mbt);
		spin_unlock(&mbuf_track_spin);
		kfree(mbt, M_MTRACK);
	}
}

void
mbuftrackid(struct mbuf *m, int trackid)
{
	struct mbtrack *mbt;
	struct mbuf *n;

	spin_lock(&mbuf_track_spin);
	while (m) {
		n = m->m_nextpkt;
		while (m) {
			mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
			if (mbt == NULL) {
				spin_unlock(&mbuf_track_spin);
				panic("mbuftrackid: mbuf %p not tracked", m);
			}
			mbt->trackid = trackid;
			m = m->m_next;
		}
		m = n;
	}
spin_unlock(&mbuf_track_spin); 196 } 197 198 static int 199 mbuftrack_callback(struct mbtrack *mbt, void *arg) 200 { 201 struct sysctl_req *req = arg; 202 char buf[64]; 203 int error; 204 205 ksnprintf(buf, sizeof(buf), "mbuf %p track %d\n", mbt->m, mbt->trackid); 206 207 spin_unlock(&mbuf_track_spin); 208 error = SYSCTL_OUT(req, buf, strlen(buf)); 209 spin_lock(&mbuf_track_spin); 210 if (error) 211 return(-error); 212 return(0); 213 } 214 215 static int 216 mbuftrack_show(SYSCTL_HANDLER_ARGS) 217 { 218 int error; 219 220 spin_lock(&mbuf_track_spin); 221 error = mbuf_rb_tree_RB_SCAN(&mbuf_track_root, NULL, 222 mbuftrack_callback, req); 223 spin_unlock(&mbuf_track_spin); 224 return (-error); 225 } 226 SYSCTL_PROC(_kern_ipc, OID_AUTO, showmbufs, CTLFLAG_RD|CTLTYPE_STRING, 227 0, 0, mbuftrack_show, "A", "Show all in-use mbufs"); 228 229 #else 230 231 #define mbuftrack(m) 232 #define mbufuntrack(m) 233 234 #endif 235 236 static void mbinit(void *); 237 SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL) 238 239 static u_long mbtypes[SMP_MAXCPU][MT_NTYPES]; 240 241 static struct mbstat mbstat[SMP_MAXCPU]; 242 int max_linkhdr; 243 int max_protohdr; 244 int max_hdr; 245 int max_datalen; 246 int m_defragpackets; 247 int m_defragbytes; 248 int m_defraguseless; 249 int m_defragfailure; 250 #ifdef MBUF_STRESS_TEST 251 int m_defragrandomfailures; 252 #endif 253 254 struct objcache *mbuf_cache, *mbufphdr_cache; 255 struct objcache *mclmeta_cache, *mjclmeta_cache; 256 struct objcache *mbufcluster_cache, *mbufphdrcluster_cache; 257 struct objcache *mbufjcluster_cache, *mbufphdrjcluster_cache; 258 259 int nmbclusters; 260 static int nmbjclusters; 261 int nmbufs; 262 263 static int mclph_cachefrac; 264 static int mcl_cachefrac; 265 266 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, 267 &max_linkhdr, 0, "Max size of a link-level header"); 268 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, 269 &max_protohdr, 0, "Max size of a protocol header"); 270 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, 271 "Max size of link+protocol headers"); 272 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, 273 &max_datalen, 0, "Max data payload size without headers"); 274 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, 275 &mbuf_wait, 0, "Time in ticks to sleep after failed mbuf allocations"); 276 static int do_mbstat(SYSCTL_HANDLER_ARGS); 277 278 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT|CTLFLAG_RD, 279 0, 0, do_mbstat, "S,mbstat", "mbuf usage statistics"); 280 281 static int do_mbtypes(SYSCTL_HANDLER_ARGS); 282 283 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtypes, CTLTYPE_ULONG|CTLFLAG_RD, 284 0, 0, do_mbtypes, "LU", ""); 285 286 static int 287 do_mbstat(SYSCTL_HANDLER_ARGS) 288 { 289 struct mbstat mbstat_total; 290 struct mbstat *mbstat_totalp; 291 int i; 292 293 bzero(&mbstat_total, sizeof(mbstat_total)); 294 mbstat_totalp = &mbstat_total; 295 296 for (i = 0; i < ncpus; i++) 297 { 298 mbstat_total.m_mbufs += mbstat[i].m_mbufs; 299 mbstat_total.m_clusters += mbstat[i].m_clusters; 300 mbstat_total.m_spare += mbstat[i].m_spare; 301 mbstat_total.m_clfree += mbstat[i].m_clfree; 302 mbstat_total.m_drops += mbstat[i].m_drops; 303 mbstat_total.m_wait += mbstat[i].m_wait; 304 mbstat_total.m_drain += mbstat[i].m_drain; 305 mbstat_total.m_mcfail += mbstat[i].m_mcfail; 306 mbstat_total.m_mpfail += mbstat[i].m_mpfail; 307 308 } 309 /* 310 * The following fields are not cumulative fields so just 311 * get their values once. 
312 */ 313 mbstat_total.m_msize = mbstat[0].m_msize; 314 mbstat_total.m_mclbytes = mbstat[0].m_mclbytes; 315 mbstat_total.m_minclsize = mbstat[0].m_minclsize; 316 mbstat_total.m_mlen = mbstat[0].m_mlen; 317 mbstat_total.m_mhlen = mbstat[0].m_mhlen; 318 319 return(sysctl_handle_opaque(oidp, mbstat_totalp, sizeof(mbstat_total), req)); 320 } 321 322 static int 323 do_mbtypes(SYSCTL_HANDLER_ARGS) 324 { 325 u_long totals[MT_NTYPES]; 326 int i, j; 327 328 for (i = 0; i < MT_NTYPES; i++) 329 totals[i] = 0; 330 331 for (i = 0; i < ncpus; i++) 332 { 333 for (j = 0; j < MT_NTYPES; j++) 334 totals[j] += mbtypes[i][j]; 335 } 336 337 return(sysctl_handle_opaque(oidp, totals, sizeof(totals), req)); 338 } 339 340 /* 341 * These are read-only because we do not currently have any code 342 * to adjust the objcache limits after the fact. The variables 343 * may only be set as boot-time tunables. 344 */ 345 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, 346 &nmbclusters, 0, "Maximum number of mbuf clusters available"); 347 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0, 348 "Maximum number of mbufs available"); 349 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjclusters, CTLFLAG_RD, &nmbjclusters, 0, 350 "Maximum number of mbuf jclusters available"); 351 SYSCTL_INT(_kern_ipc, OID_AUTO, mclph_cachefrac, CTLFLAG_RD, 352 &mclph_cachefrac, 0, 353 "Fraction of cacheable mbuf clusters w/ pkthdr"); 354 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_cachefrac, CTLFLAG_RD, 355 &mcl_cachefrac, 0, "Fraction of cacheable mbuf clusters"); 356 357 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, 358 &m_defragpackets, 0, "Number of defragment packets"); 359 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, 360 &m_defragbytes, 0, "Number of defragment bytes"); 361 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, 362 &m_defraguseless, 0, "Number of useless defragment mbuf chain operations"); 363 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, 364 &m_defragfailure, 0, "Number of failed defragment mbuf chain operations"); 365 #ifdef MBUF_STRESS_TEST 366 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, 367 &m_defragrandomfailures, 0, ""); 368 #endif 369 370 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); 371 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl"); 372 static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta"); 373 374 static void m_reclaim (void); 375 static void m_mclref(void *arg); 376 static void m_mclfree(void *arg); 377 378 /* 379 * NOTE: Default NMBUFS must take into account a possible DOS attack 380 * using fd passing on unix domain sockets. 381 */ 382 #ifndef NMBCLUSTERS 383 #define NMBCLUSTERS (512 + maxusers * 16) 384 #endif 385 #ifndef MCLPH_CACHEFRAC 386 #define MCLPH_CACHEFRAC 16 387 #endif 388 #ifndef MCL_CACHEFRAC 389 #define MCL_CACHEFRAC 4 390 #endif 391 #ifndef NMBJCLUSTERS 392 #define NMBJCLUSTERS 2048 393 #endif 394 #ifndef NMBUFS 395 #define NMBUFS (nmbclusters * 2 + maxfiles) 396 #endif 397 398 /* 399 * Perform sanity checks of tunables declared above. 400 */ 401 static void 402 tunable_mbinit(void *dummy) 403 { 404 /* 405 * This has to be done before VM init. 
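	 *
	 * Purely as an illustration (not part of the original file), these
	 * limits are normally set from the boot loader before the kernel
	 * starts, e.g. with hypothetical values in /boot/loader.conf:
	 *
	 *	kern.ipc.nmbclusters="65536"
	 *	kern.ipc.nmbjclusters="4096"
	 *	kern.ipc.nmbufs="131072"
	 *
	 * The matching sysctls are declared CTLFLAG_RD above, so they are
	 * read-only at run-time and tunables are the only way to change them.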
406 */ 407 nmbclusters = NMBCLUSTERS; 408 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); 409 mclph_cachefrac = MCLPH_CACHEFRAC; 410 TUNABLE_INT_FETCH("kern.ipc.mclph_cachefrac", &mclph_cachefrac); 411 mcl_cachefrac = MCL_CACHEFRAC; 412 TUNABLE_INT_FETCH("kern.ipc.mcl_cachefrac", &mcl_cachefrac); 413 414 nmbjclusters = NMBJCLUSTERS; 415 TUNABLE_INT_FETCH("kern.ipc.nmbjclusters", &nmbjclusters); 416 417 nmbufs = NMBUFS; 418 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); 419 420 /* Sanity checks */ 421 if (nmbufs < nmbclusters * 2) 422 nmbufs = nmbclusters * 2; 423 } 424 SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY, 425 tunable_mbinit, NULL); 426 427 /* "number of clusters of pages" */ 428 #define NCL_INIT 1 429 430 #define NMB_INIT 16 431 432 /* 433 * The mbuf object cache only guarantees that m_next and m_nextpkt are 434 * NULL and that m_data points to the beginning of the data area. In 435 * particular, m_len and m_pkthdr.len are uninitialized. It is the 436 * responsibility of the caller to initialize those fields before use. 437 */ 438 439 static __inline boolean_t 440 mbuf_ctor(void *obj, void *private, int ocflags) 441 { 442 struct mbuf *m = obj; 443 444 m->m_next = NULL; 445 m->m_nextpkt = NULL; 446 m->m_data = m->m_dat; 447 m->m_flags = 0; 448 449 return (TRUE); 450 } 451 452 /* 453 * Initialize the mbuf and the packet header fields. 454 */ 455 static boolean_t 456 mbufphdr_ctor(void *obj, void *private, int ocflags) 457 { 458 struct mbuf *m = obj; 459 460 m->m_next = NULL; 461 m->m_nextpkt = NULL; 462 m->m_data = m->m_pktdat; 463 m->m_flags = M_PKTHDR | M_PHCACHE; 464 465 m->m_pkthdr.rcvif = NULL; /* eliminate XXX JH */ 466 SLIST_INIT(&m->m_pkthdr.tags); 467 m->m_pkthdr.csum_flags = 0; /* eliminate XXX JH */ 468 m->m_pkthdr.fw_flags = 0; /* eliminate XXX JH */ 469 470 return (TRUE); 471 } 472 473 /* 474 * A mbcluster object consists of 2K (MCLBYTES) cluster and a refcount. 475 */ 476 static boolean_t 477 mclmeta_ctor(void *obj, void *private, int ocflags) 478 { 479 struct mbcluster *cl = obj; 480 void *buf; 481 482 if (ocflags & M_NOWAIT) 483 buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO); 484 else 485 buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO); 486 if (buf == NULL) 487 return (FALSE); 488 cl->mcl_refs = 0; 489 cl->mcl_data = buf; 490 return (TRUE); 491 } 492 493 static boolean_t 494 mjclmeta_ctor(void *obj, void *private, int ocflags) 495 { 496 struct mbcluster *cl = obj; 497 void *buf; 498 499 if (ocflags & M_NOWAIT) 500 buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_NOWAIT | M_ZERO); 501 else 502 buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_INTWAIT | M_ZERO); 503 if (buf == NULL) 504 return (FALSE); 505 cl->mcl_refs = 0; 506 cl->mcl_data = buf; 507 return (TRUE); 508 } 509 510 static void 511 mclmeta_dtor(void *obj, void *private) 512 { 513 struct mbcluster *mcl = obj; 514 515 KKASSERT(mcl->mcl_refs == 0); 516 kfree(mcl->mcl_data, M_MBUFCL); 517 } 518 519 static void 520 linkjcluster(struct mbuf *m, struct mbcluster *cl, uint size) 521 { 522 /* 523 * Add the cluster to the mbuf. The caller will detect that the 524 * mbuf now has an attached cluster. 
525 */ 526 m->m_ext.ext_arg = cl; 527 m->m_ext.ext_buf = cl->mcl_data; 528 m->m_ext.ext_ref = m_mclref; 529 m->m_ext.ext_free = m_mclfree; 530 m->m_ext.ext_size = size; 531 atomic_add_int(&cl->mcl_refs, 1); 532 533 m->m_data = m->m_ext.ext_buf; 534 m->m_flags |= M_EXT | M_EXT_CLUSTER; 535 } 536 537 static void 538 linkcluster(struct mbuf *m, struct mbcluster *cl) 539 { 540 linkjcluster(m, cl, MCLBYTES); 541 } 542 543 static boolean_t 544 mbufphdrcluster_ctor(void *obj, void *private, int ocflags) 545 { 546 struct mbuf *m = obj; 547 struct mbcluster *cl; 548 549 mbufphdr_ctor(obj, private, ocflags); 550 cl = objcache_get(mclmeta_cache, ocflags); 551 if (cl == NULL) { 552 ++mbstat[mycpu->gd_cpuid].m_drops; 553 return (FALSE); 554 } 555 m->m_flags |= M_CLCACHE; 556 linkcluster(m, cl); 557 return (TRUE); 558 } 559 560 static boolean_t 561 mbufphdrjcluster_ctor(void *obj, void *private, int ocflags) 562 { 563 struct mbuf *m = obj; 564 struct mbcluster *cl; 565 566 mbufphdr_ctor(obj, private, ocflags); 567 cl = objcache_get(mjclmeta_cache, ocflags); 568 if (cl == NULL) { 569 ++mbstat[mycpu->gd_cpuid].m_drops; 570 return (FALSE); 571 } 572 m->m_flags |= M_CLCACHE; 573 linkjcluster(m, cl, MJUMPAGESIZE); 574 return (TRUE); 575 } 576 577 static boolean_t 578 mbufcluster_ctor(void *obj, void *private, int ocflags) 579 { 580 struct mbuf *m = obj; 581 struct mbcluster *cl; 582 583 mbuf_ctor(obj, private, ocflags); 584 cl = objcache_get(mclmeta_cache, ocflags); 585 if (cl == NULL) { 586 ++mbstat[mycpu->gd_cpuid].m_drops; 587 return (FALSE); 588 } 589 m->m_flags |= M_CLCACHE; 590 linkcluster(m, cl); 591 return (TRUE); 592 } 593 594 static boolean_t 595 mbufjcluster_ctor(void *obj, void *private, int ocflags) 596 { 597 struct mbuf *m = obj; 598 struct mbcluster *cl; 599 600 mbuf_ctor(obj, private, ocflags); 601 cl = objcache_get(mjclmeta_cache, ocflags); 602 if (cl == NULL) { 603 ++mbstat[mycpu->gd_cpuid].m_drops; 604 return (FALSE); 605 } 606 m->m_flags |= M_CLCACHE; 607 linkjcluster(m, cl, MJUMPAGESIZE); 608 return (TRUE); 609 } 610 611 /* 612 * Used for both the cluster and cluster PHDR caches. 613 * 614 * The mbuf may have lost its cluster due to sharing, deal 615 * with the situation by checking M_EXT. 616 */ 617 static void 618 mbufcluster_dtor(void *obj, void *private) 619 { 620 struct mbuf *m = obj; 621 struct mbcluster *mcl; 622 623 if (m->m_flags & M_EXT) { 624 KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0); 625 mcl = m->m_ext.ext_arg; 626 KKASSERT(mcl->mcl_refs == 1); 627 mcl->mcl_refs = 0; 628 if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES) 629 objcache_put(mjclmeta_cache, mcl); 630 else 631 objcache_put(mclmeta_cache, mcl); 632 } 633 } 634 635 struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF }; 636 struct objcache_malloc_args mclmeta_malloc_args = 637 { sizeof(struct mbcluster), M_MCLMETA }; 638 639 /* ARGSUSED*/ 640 static void 641 mbinit(void *dummy) 642 { 643 int mb_limit, cl_limit, ncl_limit, jcl_limit; 644 int limit; 645 int i; 646 647 /* 648 * Initialize statistics 649 */ 650 for (i = 0; i < ncpus; i++) { 651 mbstat[i].m_msize = MSIZE; 652 mbstat[i].m_mclbytes = MCLBYTES; 653 mbstat[i].m_mjumpagesize = MJUMPAGESIZE; 654 mbstat[i].m_minclsize = MINCLSIZE; 655 mbstat[i].m_mlen = MLEN; 656 mbstat[i].m_mhlen = MHLEN; 657 } 658 659 /* 660 * Create objtect caches and save cluster limits, which will 661 * be used to adjust backing kmalloc pools' limit later. 
662 */ 663 664 mb_limit = cl_limit = 0; 665 666 limit = nmbufs; 667 mbuf_cache = objcache_create("mbuf", 668 limit, 0, 669 mbuf_ctor, NULL, NULL, 670 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); 671 mb_limit += limit; 672 673 limit = nmbufs; 674 mbufphdr_cache = objcache_create("mbuf pkt hdr", 675 limit, nmbufs / 4, 676 mbufphdr_ctor, NULL, NULL, 677 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); 678 mb_limit += limit; 679 680 ncl_limit = nmbclusters; 681 mclmeta_cache = objcache_create("cluster mbuf", 682 ncl_limit, 0, 683 mclmeta_ctor, mclmeta_dtor, NULL, 684 objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args); 685 cl_limit += ncl_limit; 686 687 jcl_limit = nmbjclusters; 688 mjclmeta_cache = objcache_create("jcluster mbuf", 689 jcl_limit, 0, 690 mjclmeta_ctor, mclmeta_dtor, NULL, 691 objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args); 692 cl_limit += jcl_limit; 693 694 limit = nmbclusters; 695 mbufcluster_cache = objcache_create("mbuf + cluster", 696 limit, nmbclusters / mcl_cachefrac, 697 mbufcluster_ctor, mbufcluster_dtor, NULL, 698 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); 699 mb_limit += limit; 700 701 limit = nmbclusters; 702 mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster", 703 limit, nmbclusters / mclph_cachefrac, 704 mbufphdrcluster_ctor, mbufcluster_dtor, NULL, 705 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); 706 mb_limit += limit; 707 708 limit = nmbjclusters / 4; /* XXX really rarely used */ 709 mbufjcluster_cache = objcache_create("mbuf + jcluster", 710 limit, 0, 711 mbufjcluster_ctor, mbufcluster_dtor, NULL, 712 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); 713 mb_limit += limit; 714 715 limit = nmbjclusters; 716 mbufphdrjcluster_cache = objcache_create("mbuf pkt hdr + jcluster", 717 limit, nmbjclusters / 16, 718 mbufphdrjcluster_ctor, mbufcluster_dtor, NULL, 719 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); 720 mb_limit += limit; 721 722 /* 723 * Adjust backing kmalloc pools' limit 724 * 725 * NOTE: We raise the limit by another 1/8 to take the effect 726 * of loosememuse into account. 727 */ 728 cl_limit += cl_limit / 8; 729 kmalloc_raise_limit(mclmeta_malloc_args.mtype, 730 mclmeta_malloc_args.objsize * (size_t)cl_limit); 731 kmalloc_raise_limit(M_MBUFCL, 732 (MCLBYTES * (size_t)ncl_limit) + 733 (MJUMPAGESIZE * (size_t)jcl_limit)); 734 735 mb_limit += mb_limit / 8; 736 kmalloc_raise_limit(mbuf_malloc_args.mtype, 737 mbuf_malloc_args.objsize * (size_t)mb_limit); 738 } 739 740 /* 741 * Return the number of references to this mbuf's data. 0 is returned 742 * if the mbuf is not M_EXT, a reference count is returned if it is 743 * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT. 
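 *
 * Illustrative note (added, not part of the original): callers normally
 * only care whether the underlying data is shared at all, e.g.
 *
 *	if (m_sharecount(m) > 1)
 *		...the cluster is shared; treat the data as read-only...
 *	else
 *		...sole reference (or no cluster); safe to modify in place...
 *
 * The sentinel value 99 returned for non-cluster external storage makes
 * such a test treat that storage as always shared, which is the
 * conservative assumption.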
744 */ 745 int 746 m_sharecount(struct mbuf *m) 747 { 748 switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) { 749 case 0: 750 return (0); 751 case M_EXT: 752 return (99); 753 case M_EXT | M_EXT_CLUSTER: 754 return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs); 755 } 756 /* NOTREACHED */ 757 return (0); /* to shut up compiler */ 758 } 759 760 /* 761 * change mbuf to new type 762 */ 763 void 764 m_chtype(struct mbuf *m, int type) 765 { 766 struct globaldata *gd = mycpu; 767 768 ++mbtypes[gd->gd_cpuid][type]; 769 --mbtypes[gd->gd_cpuid][m->m_type]; 770 m->m_type = type; 771 } 772 773 static void 774 m_reclaim(void) 775 { 776 struct domain *dp; 777 struct protosw *pr; 778 779 kprintf("Debug: m_reclaim() called\n"); 780 781 SLIST_FOREACH(dp, &domains, dom_next) { 782 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { 783 if (pr->pr_drain) 784 (*pr->pr_drain)(); 785 } 786 } 787 ++mbstat[mycpu->gd_cpuid].m_drain; 788 } 789 790 static __inline void 791 updatestats(struct mbuf *m, int type) 792 { 793 struct globaldata *gd = mycpu; 794 795 m->m_type = type; 796 mbuftrack(m); 797 #ifdef MBUF_DEBUG 798 KASSERT(m->m_next == NULL, ("mbuf %p: bad m_next in get", m)); 799 KASSERT(m->m_nextpkt == NULL, ("mbuf %p: bad m_nextpkt in get", m)); 800 #endif 801 802 ++mbtypes[gd->gd_cpuid][type]; 803 ++mbstat[gd->gd_cpuid].m_mbufs; 804 805 } 806 807 /* 808 * Allocate an mbuf. 809 */ 810 struct mbuf * 811 m_get(int how, int type) 812 { 813 struct mbuf *m; 814 int ntries = 0; 815 int ocf = MBTOM(how); 816 817 retryonce: 818 819 m = objcache_get(mbuf_cache, ocf); 820 821 if (m == NULL) { 822 if ((how & MB_TRYWAIT) && ntries++ == 0) { 823 struct objcache *reclaimlist[] = { 824 mbufphdr_cache, 825 mbufcluster_cache, 826 mbufphdrcluster_cache, 827 mbufjcluster_cache, 828 mbufphdrjcluster_cache 829 }; 830 const int nreclaims = NELEM(reclaimlist); 831 832 if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf)) 833 m_reclaim(); 834 goto retryonce; 835 } 836 ++mbstat[mycpu->gd_cpuid].m_drops; 837 return (NULL); 838 } 839 #ifdef MBUF_DEBUG 840 KASSERT(m->m_data == m->m_dat, ("mbuf %p: bad m_data in get", m)); 841 #endif 842 m->m_len = 0; 843 844 updatestats(m, type); 845 return (m); 846 } 847 848 struct mbuf * 849 m_gethdr(int how, int type) 850 { 851 struct mbuf *m; 852 int ocf = MBTOM(how); 853 int ntries = 0; 854 855 retryonce: 856 857 m = objcache_get(mbufphdr_cache, ocf); 858 859 if (m == NULL) { 860 if ((how & MB_TRYWAIT) && ntries++ == 0) { 861 struct objcache *reclaimlist[] = { 862 mbuf_cache, 863 mbufcluster_cache, mbufphdrcluster_cache, 864 mbufjcluster_cache, mbufphdrjcluster_cache 865 }; 866 const int nreclaims = NELEM(reclaimlist); 867 868 if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf)) 869 m_reclaim(); 870 goto retryonce; 871 } 872 ++mbstat[mycpu->gd_cpuid].m_drops; 873 return (NULL); 874 } 875 #ifdef MBUF_DEBUG 876 KASSERT(m->m_data == m->m_pktdat, ("mbuf %p: bad m_data in get", m)); 877 #endif 878 m->m_len = 0; 879 m->m_pkthdr.len = 0; 880 881 updatestats(m, type); 882 return (m); 883 } 884 885 /* 886 * Get a mbuf (not a mbuf cluster!) and zero it. 887 * Deprecated. 
888 */ 889 struct mbuf * 890 m_getclr(int how, int type) 891 { 892 struct mbuf *m; 893 894 m = m_get(how, type); 895 if (m != NULL) 896 bzero(m->m_data, MLEN); 897 return (m); 898 } 899 900 static struct mbuf * 901 m_getcl_cache(int how, short type, int flags, struct objcache *mbclc, 902 struct objcache *mbphclc) 903 { 904 struct mbuf *m = NULL; 905 int ocflags = MBTOM(how); 906 int ntries = 0; 907 908 retryonce: 909 910 if (flags & M_PKTHDR) 911 m = objcache_get(mbphclc, ocflags); 912 else 913 m = objcache_get(mbclc, ocflags); 914 915 if (m == NULL) { 916 if ((how & MB_TRYWAIT) && ntries++ == 0) { 917 struct objcache *reclaimlist[1]; 918 919 if (flags & M_PKTHDR) 920 reclaimlist[0] = mbclc; 921 else 922 reclaimlist[0] = mbphclc; 923 if (!objcache_reclaimlist(reclaimlist, 1, ocflags)) 924 m_reclaim(); 925 goto retryonce; 926 } 927 ++mbstat[mycpu->gd_cpuid].m_drops; 928 return (NULL); 929 } 930 931 #ifdef MBUF_DEBUG 932 KASSERT(m->m_data == m->m_ext.ext_buf, 933 ("mbuf %p: bad m_data in get", m)); 934 #endif 935 m->m_type = type; 936 m->m_len = 0; 937 m->m_pkthdr.len = 0; /* just do it unconditonally */ 938 939 mbuftrack(m); 940 941 ++mbtypes[mycpu->gd_cpuid][type]; 942 ++mbstat[mycpu->gd_cpuid].m_clusters; 943 return (m); 944 } 945 946 struct mbuf * 947 m_getjcl(int how, short type, int flags, size_t size) 948 { 949 struct objcache *mbclc, *mbphclc; 950 951 switch (size) { 952 case MCLBYTES: 953 mbclc = mbufcluster_cache; 954 mbphclc = mbufphdrcluster_cache; 955 break; 956 957 default: 958 mbclc = mbufjcluster_cache; 959 mbphclc = mbufphdrjcluster_cache; 960 break; 961 } 962 return m_getcl_cache(how, type, flags, mbclc, mbphclc); 963 } 964 965 /* 966 * Returns an mbuf with an attached cluster. 967 * Because many network drivers use this kind of buffers a lot, it is 968 * convenient to keep a small pool of free buffers of this kind. 969 * Even a small size such as 10 gives about 10% improvement in the 970 * forwarding rate in a bridge or router. 971 */ 972 struct mbuf * 973 m_getcl(int how, short type, int flags) 974 { 975 return m_getcl_cache(how, type, flags, 976 mbufcluster_cache, mbufphdrcluster_cache); 977 } 978 979 /* 980 * Allocate chain of requested length. 981 */ 982 struct mbuf * 983 m_getc(int len, int how, int type) 984 { 985 struct mbuf *n, *nfirst = NULL, **ntail = &nfirst; 986 int nsize; 987 988 while (len > 0) { 989 n = m_getl(len, how, type, 0, &nsize); 990 if (n == NULL) 991 goto failed; 992 n->m_len = 0; 993 *ntail = n; 994 ntail = &n->m_next; 995 len -= nsize; 996 } 997 return (nfirst); 998 999 failed: 1000 m_freem(nfirst); 1001 return (NULL); 1002 } 1003 1004 /* 1005 * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best) 1006 * and return a pointer to the head of the allocated chain. If m0 is 1007 * non-null, then we assume that it is a single mbuf or an mbuf chain to 1008 * which we want len bytes worth of mbufs and/or clusters attached, and so 1009 * if we succeed in allocating it, we will just return a pointer to m0. 1010 * 1011 * If we happen to fail at any point during the allocation, we will free 1012 * up everything we have already allocated and return NULL. 1013 * 1014 * Deprecated. Use m_getc() and m_cat() instead. 
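 *
 * A minimal sketch (added, not part of the original) of the replacement
 * pattern named above, appending "len" bytes worth of fresh mbufs to an
 * existing chain m0:
 *
 *	n = m_getc(len, how, type);
 *	if (n == NULL)
 *		...handle the allocation failure, m0 is untouched...
 *	else
 *		m_cat(m0, n);
 *
 * m_getc() builds the new chain up front, so a failure never leaves a
 * partially extended m0 behind.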
1015 */ 1016 struct mbuf * 1017 m_getm(struct mbuf *m0, int len, int type, int how) 1018 { 1019 struct mbuf *nfirst; 1020 1021 nfirst = m_getc(len, how, type); 1022 1023 if (m0 != NULL) { 1024 m_last(m0)->m_next = nfirst; 1025 return (m0); 1026 } 1027 1028 return (nfirst); 1029 } 1030 1031 /* 1032 * Adds a cluster to a normal mbuf, M_EXT is set on success. 1033 * Deprecated. Use m_getcl() instead. 1034 */ 1035 void 1036 m_mclget(struct mbuf *m, int how) 1037 { 1038 struct mbcluster *mcl; 1039 1040 KKASSERT((m->m_flags & M_EXT) == 0); 1041 mcl = objcache_get(mclmeta_cache, MBTOM(how)); 1042 if (mcl != NULL) { 1043 linkcluster(m, mcl); 1044 ++mbstat[mycpu->gd_cpuid].m_clusters; 1045 } else { 1046 ++mbstat[mycpu->gd_cpuid].m_drops; 1047 } 1048 } 1049 1050 /* 1051 * Updates to mbcluster must be MPSAFE. Only an entity which already has 1052 * a reference to the cluster can ref it, so we are in no danger of 1053 * racing an add with a subtract. But the operation must still be atomic 1054 * since multiple entities may have a reference on the cluster. 1055 * 1056 * m_mclfree() is almost the same but it must contend with two entities 1057 * freeing the cluster at the same time. 1058 */ 1059 static void 1060 m_mclref(void *arg) 1061 { 1062 struct mbcluster *mcl = arg; 1063 1064 atomic_add_int(&mcl->mcl_refs, 1); 1065 } 1066 1067 /* 1068 * When dereferencing a cluster we have to deal with a N->0 race, where 1069 * N entities free their references simultaniously. To do this we use 1070 * atomic_fetchadd_int(). 1071 */ 1072 static void 1073 m_mclfree(void *arg) 1074 { 1075 struct mbcluster *mcl = arg; 1076 1077 if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) { 1078 --mbstat[mycpu->gd_cpuid].m_clusters; 1079 objcache_put(mclmeta_cache, mcl); 1080 } 1081 } 1082 1083 /* 1084 * Free a single mbuf and any associated external storage. The successor, 1085 * if any, is returned. 1086 * 1087 * We do need to check non-first mbuf for m_aux, since some of existing 1088 * code does not call M_PREPEND properly. 1089 * (example: call to bpf_mtap from drivers) 1090 */ 1091 1092 #ifdef MBUF_DEBUG 1093 1094 struct mbuf * 1095 _m_free(struct mbuf *m, const char *func) 1096 1097 #else 1098 1099 struct mbuf * 1100 m_free(struct mbuf *m) 1101 1102 #endif 1103 { 1104 struct mbuf *n; 1105 struct globaldata *gd = mycpu; 1106 1107 KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m)); 1108 KASSERT(M_TRAILINGSPACE(m) >= 0, ("overflowed mbuf %p", m)); 1109 --mbtypes[gd->gd_cpuid][m->m_type]; 1110 1111 n = m->m_next; 1112 1113 /* 1114 * Make sure the mbuf is in constructed state before returning it 1115 * to the objcache. 1116 */ 1117 m->m_next = NULL; 1118 mbufuntrack(m); 1119 #ifdef MBUF_DEBUG 1120 m->m_hdr.mh_lastfunc = func; 1121 #endif 1122 #ifdef notyet 1123 KKASSERT(m->m_nextpkt == NULL); 1124 #else 1125 if (m->m_nextpkt != NULL) { 1126 static int afewtimes = 10; 1127 1128 if (afewtimes-- > 0) { 1129 kprintf("mfree: m->m_nextpkt != NULL\n"); 1130 print_backtrace(-1); 1131 } 1132 m->m_nextpkt = NULL; 1133 } 1134 #endif 1135 if (m->m_flags & M_PKTHDR) { 1136 m_tag_delete_chain(m); /* eliminate XXX JH */ 1137 } 1138 1139 m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE); 1140 1141 /* 1142 * Clean the M_PKTHDR state so we can return the mbuf to its original 1143 * cache. This is based on the PHCACHE flag which tells us whether 1144 * the mbuf was originally allocated out of a packet-header cache 1145 * or a non-packet-header cache. 
1146 */ 1147 if (m->m_flags & M_PHCACHE) { 1148 m->m_flags |= M_PKTHDR; 1149 m->m_pkthdr.rcvif = NULL; /* eliminate XXX JH */ 1150 m->m_pkthdr.csum_flags = 0; /* eliminate XXX JH */ 1151 m->m_pkthdr.fw_flags = 0; /* eliminate XXX JH */ 1152 SLIST_INIT(&m->m_pkthdr.tags); 1153 } 1154 1155 /* 1156 * Handle remaining flags combinations. M_CLCACHE tells us whether 1157 * the mbuf was originally allocated from a cluster cache or not, 1158 * and is totally separate from whether the mbuf is currently 1159 * associated with a cluster. 1160 */ 1161 switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) { 1162 case M_CLCACHE | M_EXT | M_EXT_CLUSTER: 1163 /* 1164 * mbuf+cluster cache case. The mbuf was allocated from the 1165 * combined mbuf_cluster cache and can be returned to the 1166 * cache if the cluster hasn't been shared. 1167 */ 1168 if (m_sharecount(m) == 1) { 1169 /* 1170 * The cluster has not been shared, we can just 1171 * reset the data pointer and return the mbuf 1172 * to the cluster cache. Note that the reference 1173 * count is left intact (it is still associated with 1174 * an mbuf). 1175 */ 1176 m->m_data = m->m_ext.ext_buf; 1177 if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES) { 1178 if (m->m_flags & M_PHCACHE) 1179 objcache_put(mbufphdrjcluster_cache, m); 1180 else 1181 objcache_put(mbufjcluster_cache, m); 1182 } else { 1183 if (m->m_flags & M_PHCACHE) 1184 objcache_put(mbufphdrcluster_cache, m); 1185 else 1186 objcache_put(mbufcluster_cache, m); 1187 } 1188 --mbstat[mycpu->gd_cpuid].m_clusters; 1189 } else { 1190 /* 1191 * Hell. Someone else has a ref on this cluster, 1192 * we have to disconnect it which means we can't 1193 * put it back into the mbufcluster_cache, we 1194 * have to destroy the mbuf. 1195 * 1196 * Other mbuf references to the cluster will typically 1197 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE. 1198 * 1199 * XXX we could try to connect another cluster to 1200 * it. 1201 */ 1202 m->m_ext.ext_free(m->m_ext.ext_arg); 1203 m->m_flags &= ~(M_EXT | M_EXT_CLUSTER); 1204 if (m->m_ext.ext_size == MCLBYTES) { 1205 if (m->m_flags & M_PHCACHE) 1206 objcache_dtor(mbufphdrcluster_cache, m); 1207 else 1208 objcache_dtor(mbufcluster_cache, m); 1209 } else { 1210 if (m->m_flags & M_PHCACHE) 1211 objcache_dtor(mbufphdrjcluster_cache, m); 1212 else 1213 objcache_dtor(mbufjcluster_cache, m); 1214 } 1215 } 1216 break; 1217 case M_EXT | M_EXT_CLUSTER: 1218 case M_EXT: 1219 /* 1220 * Normal cluster association case, disconnect the cluster from 1221 * the mbuf. The cluster may or may not be custom. 1222 */ 1223 m->m_ext.ext_free(m->m_ext.ext_arg); 1224 m->m_flags &= ~(M_EXT | M_EXT_CLUSTER); 1225 /* fall through */ 1226 case 0: 1227 /* 1228 * return the mbuf to the mbuf cache. 
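		 *
		 * (Added summary, not in the original.)  Where m_free()
		 * returns storage, by flag combination:
		 *
		 *	M_CLCACHE|M_EXT|M_EXT_CLUSTER
		 *		-> combined mbuf+cluster cache, or the mbuf is
		 *		   destroyed if the cluster is still shared
		 *	M_EXT (with or without M_EXT_CLUSTER)
		 *		-> cluster dropped via ext_free(), mbuf falls
		 *		   through to the plain-mbuf case below
		 *	neither
		 *		-> plain mbuf or mbuf-pkthdr cache, selected
		 *		   by M_PHCACHE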
		 */
		if (m->m_flags & M_PHCACHE) {
			m->m_data = m->m_pktdat;
			objcache_put(mbufphdr_cache, m);
		} else {
			m->m_data = m->m_dat;
			objcache_put(mbuf_cache, m);
		}
		--mbstat[mycpu->gd_cpuid].m_mbufs;
		break;
	default:
		if (!panicstr)
			panic("bad mbuf flags %p %08x", m, m->m_flags);
		break;
	}
	return (n);
}

#ifdef MBUF_DEBUG

void
_m_freem(struct mbuf *m, const char *func)
{
	while (m)
		m = _m_free(m, func);
}

#else

void
m_freem(struct mbuf *m)
{
	while (m)
		m = m_free(m);
}

#endif

void
m_extadd(struct mbuf *m, caddr_t buf, u_int size, void (*reff)(void *),
    void (*freef)(void *), void *arg)
{
	m->m_ext.ext_arg = arg;
	m->m_ext.ext_buf = buf;
	m->m_ext.ext_ref = reff;
	m->m_ext.ext_free = freef;
	m->m_ext.ext_size = size;
	reff(arg);
	m->m_data = buf;
	m->m_flags |= M_EXT;
}

/*
 * mbuf utility routines
 */

/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
 * copy junk along.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	if (m->m_flags & M_PKTHDR)
		mn = m_gethdr(how, m->m_type);
	else
		mn = m_get(how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR)
		M_MOVE_PKTHDR(mn, m);
	mn->m_next = m;
	m = mn;
	if (len < MHLEN)
		MH_ALIGN(m, len);
	m->m_len = len;
	return (m);
}

/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 */
struct mbuf *
m_copym(const struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	KASSERT(off >= 0, ("m_copym, negative off %d", off));
	KASSERT(len >= 0, ("m_copym, negative len %d", len));
	if (off == 0 && (m->m_flags & M_PKTHDR))
		copyhdr = 1;
	while (off > 0) {
		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;
	while (len > 0) {
		if (m == NULL) {
			KASSERT(len == M_COPYALL,
			    ("m_copym, length > size of mbuf chain"));
			break;
		}
		/*
		 * Because we are sharing any cluster attachment below,
		 * be sure to get an mbuf that does not have a cluster
		 * associated with it.
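		 *
		 * (Added note, not in the original.)  Because the cluster is
		 * shared by reference here rather than copied, every
		 * cluster-backed mbuf in the resulting chain will report
		 * m_sharecount() >= 2 and fail M_WRITABLE(); callers that
		 * intend to modify the copied data should use m_dup()
		 * instead.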
1350 */ 1351 if (copyhdr) 1352 n = m_gethdr(wait, m->m_type); 1353 else 1354 n = m_get(wait, m->m_type); 1355 *np = n; 1356 if (n == NULL) 1357 goto nospace; 1358 if (copyhdr) { 1359 if (!m_dup_pkthdr(n, m, wait)) 1360 goto nospace; 1361 if (len == M_COPYALL) 1362 n->m_pkthdr.len -= off0; 1363 else 1364 n->m_pkthdr.len = len; 1365 copyhdr = 0; 1366 } 1367 n->m_len = min(len, m->m_len - off); 1368 if (m->m_flags & M_EXT) { 1369 KKASSERT((n->m_flags & M_EXT) == 0); 1370 n->m_data = m->m_data + off; 1371 m->m_ext.ext_ref(m->m_ext.ext_arg); 1372 n->m_ext = m->m_ext; 1373 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER); 1374 } else { 1375 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 1376 (unsigned)n->m_len); 1377 } 1378 if (len != M_COPYALL) 1379 len -= n->m_len; 1380 off = 0; 1381 m = m->m_next; 1382 np = &n->m_next; 1383 } 1384 if (top == NULL) 1385 ++mbstat[mycpu->gd_cpuid].m_mcfail; 1386 return (top); 1387 nospace: 1388 m_freem(top); 1389 ++mbstat[mycpu->gd_cpuid].m_mcfail; 1390 return (NULL); 1391 } 1392 1393 /* 1394 * Copy an entire packet, including header (which must be present). 1395 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. 1396 * Note that the copy is read-only, because clusters are not copied, 1397 * only their reference counts are incremented. 1398 * Preserve alignment of the first mbuf so if the creator has left 1399 * some room at the beginning (e.g. for inserting protocol headers) 1400 * the copies also have the room available. 1401 */ 1402 struct mbuf * 1403 m_copypacket(struct mbuf *m, int how) 1404 { 1405 struct mbuf *top, *n, *o; 1406 1407 n = m_gethdr(how, m->m_type); 1408 top = n; 1409 if (!n) 1410 goto nospace; 1411 1412 if (!m_dup_pkthdr(n, m, how)) 1413 goto nospace; 1414 n->m_len = m->m_len; 1415 if (m->m_flags & M_EXT) { 1416 KKASSERT((n->m_flags & M_EXT) == 0); 1417 n->m_data = m->m_data; 1418 m->m_ext.ext_ref(m->m_ext.ext_arg); 1419 n->m_ext = m->m_ext; 1420 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER); 1421 } else { 1422 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); 1423 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 1424 } 1425 1426 m = m->m_next; 1427 while (m) { 1428 o = m_get(how, m->m_type); 1429 if (!o) 1430 goto nospace; 1431 1432 n->m_next = o; 1433 n = n->m_next; 1434 1435 n->m_len = m->m_len; 1436 if (m->m_flags & M_EXT) { 1437 KKASSERT((n->m_flags & M_EXT) == 0); 1438 n->m_data = m->m_data; 1439 m->m_ext.ext_ref(m->m_ext.ext_arg); 1440 n->m_ext = m->m_ext; 1441 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER); 1442 } else { 1443 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 1444 } 1445 1446 m = m->m_next; 1447 } 1448 return top; 1449 nospace: 1450 m_freem(top); 1451 ++mbstat[mycpu->gd_cpuid].m_mcfail; 1452 return (NULL); 1453 } 1454 1455 /* 1456 * Copy data from an mbuf chain starting "off" bytes from the beginning, 1457 * continuing for "len" bytes, into the indicated buffer. 
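 *
 * Typical usage sketch (added, not part of the original): extracting a
 * fixed-size header into local storage regardless of how the bytes are
 * split across the chain, where "struct example_hdr" is a hypothetical
 * protocol header:
 *
 *	struct example_hdr eh;
 *
 *	if (m->m_pkthdr.len >= sizeof(eh))
 *		m_copydata(m, 0, sizeof(eh), (caddr_t)&eh);
 *
 * The caller is responsible for ensuring off + len does not run past the
 * end of the chain; the KASSERTs below only catch this with INVARIANTS.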
 */
void
m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
{
	unsigned count;

	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
	while (off > 0) {
		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
		count = min(m->m_len - off, len);
		bcopy(mtod(m, caddr_t) + off, cp, count);
		len -= count;
		cp += count;
		off = 0;
		m = m->m_next;
	}
}

/*
 * Copy a packet header mbuf chain into a completely new chain, including
 * copying any mbuf clusters.  Use this instead of m_copypacket() when
 * you need a writable copy of an mbuf chain.
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf **p, *top = NULL;
	int remain, moff, nsize;

	/* Sanity check */
	if (m == NULL)
		return (NULL);
	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));

	/* While there's more data, get a new mbuf, tack it on, and fill it */
	remain = m->m_pkthdr.len;
	moff = 0;
	p = &top;
	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
		struct mbuf *n;

		/* Get the next new mbuf */
		n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
		    &nsize);
		if (n == NULL)
			goto nospace;
		if (top == NULL)
			if (!m_dup_pkthdr(n, m, how))
				goto nospace0;

		/* Link it into the new chain */
		*p = n;
		p = &n->m_next;

		/* Copy data from original mbuf(s) into new mbuf */
		n->m_len = 0;
		while (n->m_len < nsize && m != NULL) {
			int chunk = min(nsize - n->m_len, m->m_len - moff);

			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
			moff += chunk;
			n->m_len += chunk;
			remain -= chunk;
			if (moff == m->m_len) {
				m = m->m_next;
				moff = 0;
			}
		}

		/* Check correct total mbuf length */
		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
		    ("%s: bogus m_pkthdr.len", __func__));
	}
	return (top);

nospace:
	m_freem(top);
nospace0:
	++mbstat[mycpu->gd_cpuid].m_mcfail;
	return (NULL);
}

/*
 * Copy the non-packet mbuf data chain into a new set of mbufs, including
 * copying any mbuf clusters.  This is typically used to realign a data
 * chain by nfs_realign().
 *
 * The original chain is left intact.  how should be MB_WAIT or MB_DONTWAIT
 * and NULL can be returned if MB_DONTWAIT is passed.
 *
 * Be careful to use cluster mbufs, a large mbuf chain converted to non
 * cluster mbufs can exhaust our supply of mbufs.
 */
struct mbuf *
m_dup_data(struct mbuf *m, int how)
{
	struct mbuf **p, *n, *top = NULL;
	int mlen, moff, chunk, gsize, nsize;

	/*
	 * Degenerate case
	 */
	if (m == NULL)
		return (NULL);

	/*
	 * Optimize the mbuf allocation but do not get too carried away.
	 */
	if (m->m_next || m->m_len > MLEN) {
		if (m->m_flags & M_EXT && m->m_ext.ext_size == MCLBYTES)
			gsize = MCLBYTES;
		else
			gsize = MJUMPAGESIZE;
	} else {
		gsize = MLEN;
	}

	/* Chain control */
	p = &top;
	n = NULL;
	nsize = 0;

	/*
	 * Scan the mbuf chain until nothing is left, the new mbuf chain
	 * will be allocated on the fly as needed.
	 */
	while (m) {
		mlen = m->m_len;
		moff = 0;

		while (mlen) {
			KKASSERT(m->m_type == MT_DATA);
			if (n == NULL) {
				n = m_getl(gsize, how, MT_DATA, 0, &nsize);
				if (n == NULL)
					goto nospace;
				n->m_len = 0;
				*p = n;
				p = &n->m_next;
			}
			chunk = imin(mlen, nsize);
			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
			mlen -= chunk;
			moff += chunk;
			n->m_len += chunk;
			nsize -= chunk;
			if (nsize == 0)
				n = NULL;
		}
		m = m->m_next;
	}
	*p = NULL;
	return(top);
nospace:
	*p = NULL;
	m_freem(top);
	++mbstat[mycpu->gd_cpuid].m_mcfail;
	return (NULL);
}

/*
 * Concatenate mbuf chain n to m.
 * Both chains must be of the same type (e.g. MT_DATA).
 * Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
	m = m_last(m);
	while (n) {
		if (m->m_flags & M_EXT ||
		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
			/* just join the two chains */
			m->m_next = n;
			return;
		}
		/* splat the data from one into the other */
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)n->m_len);
		m->m_len += n->m_len;
		n = m_free(n);
	}
}

void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL)
		return;
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (mp->m_flags & M_PKTHDR)
			m->m_pkthdr.len -= (req_len - len);
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == NULL)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		while (m->m_next)
			(m = m->m_next)->m_len = 0;
	}
}

/*
 * Set the m_data pointer of a newly-allocated mbuf
 * to place an object of the specified size at the
 * end of the mbuf, longword aligned.
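 *
 * Worked example (added for illustration): for a plain mbuf the data
 * pointer is advanced by (MLEN - len) rounded down to a multiple of
 * sizeof(long); e.g. with len = 14 and sizeof(long) = 8 the adjustment
 * is (MLEN - 14) & ~7, which places the 14-byte object at a long-aligned
 * offset as close to the end of m_dat[] as possible.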
1725 */ 1726 void 1727 m_align(struct mbuf *m, int len) 1728 { 1729 int adjust; 1730 1731 if (m->m_flags & M_EXT) 1732 adjust = m->m_ext.ext_size - len; 1733 else if (m->m_flags & M_PKTHDR) 1734 adjust = MHLEN - len; 1735 else 1736 adjust = MLEN - len; 1737 m->m_data += adjust &~ (sizeof(long)-1); 1738 } 1739 1740 /* 1741 * Create a writable copy of the mbuf chain. While doing this 1742 * we compact the chain with a goal of producing a chain with 1743 * at most two mbufs. The second mbuf in this chain is likely 1744 * to be a cluster. The primary purpose of this work is to create 1745 * a writable packet for encryption, compression, etc. The 1746 * secondary goal is to linearize the data so the data can be 1747 * passed to crypto hardware in the most efficient manner possible. 1748 */ 1749 struct mbuf * 1750 m_unshare(struct mbuf *m0, int how) 1751 { 1752 struct mbuf *m, *mprev; 1753 struct mbuf *n, *mfirst, *mlast; 1754 int len, off; 1755 1756 mprev = NULL; 1757 for (m = m0; m != NULL; m = mprev->m_next) { 1758 /* 1759 * Regular mbufs are ignored unless there's a cluster 1760 * in front of it that we can use to coalesce. We do 1761 * the latter mainly so later clusters can be coalesced 1762 * also w/o having to handle them specially (i.e. convert 1763 * mbuf+cluster -> cluster). This optimization is heavily 1764 * influenced by the assumption that we're running over 1765 * Ethernet where MCLBYTES is large enough that the max 1766 * packet size will permit lots of coalescing into a 1767 * single cluster. This in turn permits efficient 1768 * crypto operations, especially when using hardware. 1769 */ 1770 if ((m->m_flags & M_EXT) == 0) { 1771 if (mprev && (mprev->m_flags & M_EXT) && 1772 m->m_len <= M_TRAILINGSPACE(mprev)) { 1773 /* XXX: this ignores mbuf types */ 1774 memcpy(mtod(mprev, caddr_t) + mprev->m_len, 1775 mtod(m, caddr_t), m->m_len); 1776 mprev->m_len += m->m_len; 1777 mprev->m_next = m->m_next; /* unlink from chain */ 1778 m_free(m); /* reclaim mbuf */ 1779 } else { 1780 mprev = m; 1781 } 1782 continue; 1783 } 1784 /* 1785 * Writable mbufs are left alone (for now). 1786 */ 1787 if (M_WRITABLE(m)) { 1788 mprev = m; 1789 continue; 1790 } 1791 1792 /* 1793 * Not writable, replace with a copy or coalesce with 1794 * the previous mbuf if possible (since we have to copy 1795 * it anyway, we try to reduce the number of mbufs and 1796 * clusters so that future work is easier). 1797 */ 1798 KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); 1799 /* NB: we only coalesce into a cluster or larger */ 1800 if (mprev != NULL && (mprev->m_flags & M_EXT) && 1801 m->m_len <= M_TRAILINGSPACE(mprev)) { 1802 /* XXX: this ignores mbuf types */ 1803 memcpy(mtod(mprev, caddr_t) + mprev->m_len, 1804 mtod(m, caddr_t), m->m_len); 1805 mprev->m_len += m->m_len; 1806 mprev->m_next = m->m_next; /* unlink from chain */ 1807 m_free(m); /* reclaim mbuf */ 1808 continue; 1809 } 1810 1811 /* 1812 * Allocate new space to hold the copy... 1813 */ 1814 /* XXX why can M_PKTHDR be set past the first mbuf? */ 1815 if (mprev == NULL && (m->m_flags & M_PKTHDR)) { 1816 /* 1817 * NB: if a packet header is present we must 1818 * allocate the mbuf separately from any cluster 1819 * because M_MOVE_PKTHDR will smash the data 1820 * pointer and drop the M_EXT marker. 
1821 */ 1822 MGETHDR(n, how, m->m_type); 1823 if (n == NULL) { 1824 m_freem(m0); 1825 return (NULL); 1826 } 1827 M_MOVE_PKTHDR(n, m); 1828 MCLGET(n, how); 1829 if ((n->m_flags & M_EXT) == 0) { 1830 m_free(n); 1831 m_freem(m0); 1832 return (NULL); 1833 } 1834 } else { 1835 n = m_getcl(how, m->m_type, m->m_flags); 1836 if (n == NULL) { 1837 m_freem(m0); 1838 return (NULL); 1839 } 1840 } 1841 /* 1842 * ... and copy the data. We deal with jumbo mbufs 1843 * (i.e. m_len > MCLBYTES) by splitting them into 1844 * clusters. We could just malloc a buffer and make 1845 * it external but too many device drivers don't know 1846 * how to break up the non-contiguous memory when 1847 * doing DMA. 1848 */ 1849 len = m->m_len; 1850 off = 0; 1851 mfirst = n; 1852 mlast = NULL; 1853 for (;;) { 1854 int cc = min(len, MCLBYTES); 1855 memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); 1856 n->m_len = cc; 1857 if (mlast != NULL) 1858 mlast->m_next = n; 1859 mlast = n; 1860 1861 len -= cc; 1862 if (len <= 0) 1863 break; 1864 off += cc; 1865 1866 n = m_getcl(how, m->m_type, m->m_flags); 1867 if (n == NULL) { 1868 m_freem(mfirst); 1869 m_freem(m0); 1870 return (NULL); 1871 } 1872 } 1873 n->m_next = m->m_next; 1874 if (mprev == NULL) 1875 m0 = mfirst; /* new head of chain */ 1876 else 1877 mprev->m_next = mfirst; /* replace old mbuf */ 1878 m_free(m); /* release old mbuf */ 1879 mprev = mfirst; 1880 } 1881 return (m0); 1882 } 1883 1884 /* 1885 * Rearrange an mbuf chain so that len bytes are contiguous 1886 * and in the data area of an mbuf (so that mtod will work for a structure 1887 * of size len). Returns the resulting mbuf chain on success, frees it and 1888 * returns null on failure. If there is room, it will add up to 1889 * max_protohdr-len extra bytes to the contiguous region in an attempt to 1890 * avoid being called next time. 1891 */ 1892 struct mbuf * 1893 m_pullup(struct mbuf *n, int len) 1894 { 1895 struct mbuf *m; 1896 int count; 1897 int space; 1898 1899 /* 1900 * If first mbuf has no cluster, and has room for len bytes 1901 * without shifting current data, pullup into it, 1902 * otherwise allocate a new mbuf to prepend to the chain. 1903 */ 1904 if (!(n->m_flags & M_EXT) && 1905 n->m_data + len < &n->m_dat[MLEN] && 1906 n->m_next) { 1907 if (n->m_len >= len) 1908 return (n); 1909 m = n; 1910 n = n->m_next; 1911 len -= m->m_len; 1912 } else { 1913 if (len > MHLEN) 1914 goto bad; 1915 if (n->m_flags & M_PKTHDR) 1916 m = m_gethdr(MB_DONTWAIT, n->m_type); 1917 else 1918 m = m_get(MB_DONTWAIT, n->m_type); 1919 if (m == NULL) 1920 goto bad; 1921 m->m_len = 0; 1922 if (n->m_flags & M_PKTHDR) 1923 M_MOVE_PKTHDR(m, n); 1924 } 1925 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 1926 do { 1927 count = min(min(max(len, max_protohdr), space), n->m_len); 1928 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, 1929 (unsigned)count); 1930 len -= count; 1931 m->m_len += count; 1932 n->m_len -= count; 1933 space -= count; 1934 if (n->m_len) 1935 n->m_data += count; 1936 else 1937 n = m_free(n); 1938 } while (len > 0 && n); 1939 if (len > 0) { 1940 m_free(m); 1941 goto bad; 1942 } 1943 m->m_next = n; 1944 return (m); 1945 bad: 1946 m_freem(n); 1947 ++mbstat[mycpu->gd_cpuid].m_mcfail; 1948 return (NULL); 1949 } 1950 1951 /* 1952 * Partition an mbuf chain in two pieces, returning the tail -- 1953 * all but the first len0 bytes. In case of failure, it returns NULL and 1954 * attempts to restore the chain to its original state. 
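 *
 * Usage sketch (added, not part of the original): detaching the payload
 * from a packet whose headers occupy the first "hdrlen" bytes, where
 * hdrlen is a hypothetical value already computed by the caller:
 *
 *	tail = m_split(m0, hdrlen, MB_DONTWAIT);
 *	if (tail == NULL)
 *		...allocation failed, m0 is still intact...
 *	else
 *		...m0 now holds the headers, tail holds the payload...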
1955 * 1956 * Note that the resulting mbufs might be read-only, because the new 1957 * mbuf can end up sharing an mbuf cluster with the original mbuf if 1958 * the "breaking point" happens to lie within a cluster mbuf. Use the 1959 * M_WRITABLE() macro to check for this case. 1960 */ 1961 struct mbuf * 1962 m_split(struct mbuf *m0, int len0, int wait) 1963 { 1964 struct mbuf *m, *n; 1965 unsigned len = len0, remain; 1966 1967 for (m = m0; m && len > m->m_len; m = m->m_next) 1968 len -= m->m_len; 1969 if (m == NULL) 1970 return (NULL); 1971 remain = m->m_len - len; 1972 if (m0->m_flags & M_PKTHDR) { 1973 n = m_gethdr(wait, m0->m_type); 1974 if (n == NULL) 1975 return (NULL); 1976 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1977 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1978 m0->m_pkthdr.len = len0; 1979 if (m->m_flags & M_EXT) 1980 goto extpacket; 1981 if (remain > MHLEN) { 1982 /* m can't be the lead packet */ 1983 MH_ALIGN(n, 0); 1984 n->m_next = m_split(m, len, wait); 1985 if (n->m_next == NULL) { 1986 m_free(n); 1987 return (NULL); 1988 } else { 1989 n->m_len = 0; 1990 return (n); 1991 } 1992 } else 1993 MH_ALIGN(n, remain); 1994 } else if (remain == 0) { 1995 n = m->m_next; 1996 m->m_next = NULL; 1997 return (n); 1998 } else { 1999 n = m_get(wait, m->m_type); 2000 if (n == NULL) 2001 return (NULL); 2002 M_ALIGN(n, remain); 2003 } 2004 extpacket: 2005 if (m->m_flags & M_EXT) { 2006 KKASSERT((n->m_flags & M_EXT) == 0); 2007 n->m_data = m->m_data + len; 2008 m->m_ext.ext_ref(m->m_ext.ext_arg); 2009 n->m_ext = m->m_ext; 2010 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER); 2011 } else { 2012 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); 2013 } 2014 n->m_len = remain; 2015 m->m_len = len; 2016 n->m_next = m->m_next; 2017 m->m_next = NULL; 2018 return (n); 2019 } 2020 2021 /* 2022 * Routine to copy from device local memory into mbufs. 2023 * Note: "offset" is ill-defined and always called as 0, so ignore it. 2024 */ 2025 struct mbuf * 2026 m_devget(char *buf, int len, int offset, struct ifnet *ifp, 2027 void (*copy)(volatile const void *from, volatile void *to, size_t length)) 2028 { 2029 struct mbuf *m, *mfirst = NULL, **mtail; 2030 int nsize, flags; 2031 2032 if (copy == NULL) 2033 copy = bcopy; 2034 mtail = &mfirst; 2035 flags = M_PKTHDR; 2036 2037 while (len > 0) { 2038 m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize); 2039 if (m == NULL) { 2040 m_freem(mfirst); 2041 return (NULL); 2042 } 2043 m->m_len = min(len, nsize); 2044 2045 if (flags & M_PKTHDR) { 2046 if (len + max_linkhdr <= nsize) 2047 m->m_data += max_linkhdr; 2048 m->m_pkthdr.rcvif = ifp; 2049 m->m_pkthdr.len = len; 2050 flags = 0; 2051 } 2052 2053 copy(buf, m->m_data, (unsigned)m->m_len); 2054 buf += m->m_len; 2055 len -= m->m_len; 2056 *mtail = m; 2057 mtail = &m->m_next; 2058 } 2059 2060 return (mfirst); 2061 } 2062 2063 /* 2064 * Routine to pad mbuf to the specified length 'padto'. 2065 */ 2066 int 2067 m_devpad(struct mbuf *m, int padto) 2068 { 2069 struct mbuf *last = NULL; 2070 int padlen; 2071 2072 if (padto <= m->m_pkthdr.len) 2073 return 0; 2074 2075 padlen = padto - m->m_pkthdr.len; 2076 2077 /* if there's only the packet-header and we can pad there, use it. */ 2078 if (m->m_pkthdr.len == m->m_len && M_TRAILINGSPACE(m) >= padlen) { 2079 last = m; 2080 } else { 2081 /* 2082 * Walk packet chain to find last mbuf. 

/*
 * Routine to pad mbuf to the specified length 'padto'.
 */
int
m_devpad(struct mbuf *m, int padto)
{
	struct mbuf *last = NULL;
	int padlen;

	if (padto <= m->m_pkthdr.len)
		return 0;

	padlen = padto - m->m_pkthdr.len;

	/* if there's only the packet-header and we can pad there, use it. */
	if (m->m_pkthdr.len == m->m_len && M_TRAILINGSPACE(m) >= padlen) {
		last = m;
	} else {
		/*
		 * Walk packet chain to find last mbuf.  We will either
		 * pad there, or append a new mbuf and pad it.
		 */
		for (last = m; last->m_next != NULL; last = last->m_next)
			; /* EMPTY */

		/* `last' now points to last in chain. */
		if (M_TRAILINGSPACE(last) < padlen) {
			struct mbuf *n;

			/* Allocate new empty mbuf, pad it.  Compact later. */
			MGET(n, MB_DONTWAIT, MT_DATA);
			if (n == NULL)
				return ENOBUFS;
			n->m_len = 0;
			last->m_next = n;
			last = n;
		}
	}
	KKASSERT(M_TRAILINGSPACE(last) >= padlen);
	KKASSERT(M_WRITABLE(last));

	/* Now zero the pad area */
	bzero(mtod(last, char *) + last->m_len, padlen);
	last->m_len += padlen;
	m->m_pkthdr.len += padlen;
	return 0;
}

/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_getclr(MB_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			n->m_len = min(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = min(m->m_len - off, len);
		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = m_get(MB_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = min(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}

/*
 * Append the specified data to the indicated mbuf chain,
 * extending the mbuf chain if the new data does not fit in
 * existing space.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
int
m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space, remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_get(MB_DONTWAIT, m->m_type);
		if (n == NULL)
			break;
		n->m_len = min(MLEN, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len, remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;
	return (remainder == 0);
}
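
/*
 * Usage sketch (illustrative only; "example_add_trailer" is
 * hypothetical): m_append() copies into trailing space when possible
 * and allocates plain mbufs for the rest, updating m_pkthdr.len; it
 * returns 1 on success and 0 if an allocation failed part way.
 */
#if 0
static int
example_add_trailer(struct mbuf *m0, const void *trailer, int tlen)
{
	if (!m_append(m0, tlen, trailer))
		return (ENOBUFS);
	return (0);
}
#endif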

/*
 * Apply function f to the data in an mbuf chain starting "off" bytes from
 * the beginning, continuing for "len" bytes.
 */
int
m_apply(struct mbuf *m, int off, int len,
	int (*f)(void *, void *, u_int), void *arg)
{
	u_int count;
	int rval;

	KASSERT(off >= 0, ("m_apply, negative off %d", off));
	KASSERT(len >= 0, ("m_apply, negative len %d", len));
	while (off > 0) {
		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
		count = min(m->m_len - off, len);
		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
		if (rval)
			return (rval);
		len -= count;
		off = 0;
		m = m->m_next;
	}
	return (0);
}

/*
 * Return a pointer to mbuf/offset of location in mbuf chain.
 */
struct mbuf *
m_getptr(struct mbuf *m, int loc, int *off)
{

	while (loc >= 0) {
		/* Normal end of search. */
		if (m->m_len > loc) {
			*off = loc;
			return (m);
		} else {
			loc -= m->m_len;
			if (m->m_next == NULL) {
				if (loc == 0) {
					/* Point at the end of valid data. */
					*off = m->m_len;
					return (m);
				}
				return (NULL);
			}
			m = m->m_next;
		}
	}
	return (NULL);
}

void
m_print(const struct mbuf *m)
{
	int len;
	const struct mbuf *m2;

	len = m->m_pkthdr.len;
	m2 = m;
	while (len) {
		kprintf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
		len -= m2->m_len;
		m2 = m2->m_next;
	}
	return;
}

/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 */
void
m_move_pkthdr(struct mbuf *to, struct mbuf *from)
{
	KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));

	to->m_flags |= from->m_flags & M_COPYFLAGS;
	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
}

/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 */
int
m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
{
	KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));

	to->m_flags = (from->m_flags & M_COPYFLAGS) |
		      (to->m_flags & ~M_COPYFLAGS);
	to->m_pkthdr = from->m_pkthdr;
	SLIST_INIT(&to->m_pkthdr.tags);
	return (m_tag_copy_chain(to, from, how));
}
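
/*
 * Usage sketch (illustrative only; "example_sum_cb" and "example_sum"
 * are hypothetical): m_apply() walks the chain and hands each
 * contiguous piece to the callback, stopping early if the callback
 * returns non-zero.
 */
#if 0
static int
example_sum_cb(void *arg, void *data, u_int len)
{
	u_int *sump = arg;
	const u_char *p = data;

	while (len--)
		*sump += *p++;
	return (0);		/* keep going */
}

static u_int
example_sum(struct mbuf *m, int off, int len)
{
	u_int sum = 0;

	m_apply(m, off, len, example_sum_cb, &sum);
	return (sum);
}
#endif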

/*
 * Defragment an mbuf chain, returning the shortest possible
 * chain of mbufs and clusters.  If allocation fails and
 * this cannot be completed, NULL will be returned, but
 * the passed-in chain will be unchanged.  Upon success,
 * the original chain will be freed, and the new chain
 * will be returned.
 *
 * If a non-packet header is passed in, the original
 * mbuf chain will be returned unharmed.
 *
 * m_defrag_nofree doesn't free the passed-in mbuf.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	struct mbuf *m_new;

	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
		return (NULL);
	if (m_new != m0)
		m_freem(m0);
	return (m_new);
}

struct mbuf *
m_defrag_nofree(struct mbuf *m0, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length, nsize;

	if (!(m0->m_flags & M_PKTHDR))
		return (m0);

#ifdef MBUF_STRESS_TEST
	if (m_defragrandomfailures) {
		int temp = karc4random() & 0xff;
		if (temp == 0xba)
			goto nospace;
	}
#endif

	m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
	if (m_final == NULL)
		goto nospace;
	m_final->m_len = 0;	/* in case m0->m_pkthdr.len is zero */

	if (m_dup_pkthdr(m_final, m0, how) == 0)
		goto nospace;

	m_new = m_final;

	while (progress < m0->m_pkthdr.len) {
		length = m0->m_pkthdr.len - progress;
		if (length > MCLBYTES)
			length = MCLBYTES;

		if (m_new == NULL) {
			m_new = m_getl(length, how, MT_DATA, 0, &nsize);
			if (m_new == NULL)
				goto nospace;
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final)
			m_cat(m_final, m_new);
		m_new = NULL;
	}
	if (m0->m_next == NULL)
		m_defraguseless++;
	m_defragpackets++;
	m_defragbytes += m_final->m_pkthdr.len;
	return (m_final);
nospace:
	m_defragfailure++;
	if (m_new)
		m_free(m_new);
	m_freem(m_final);
	return (NULL);
}

/*
 * Move data from uio into mbufs.
 */
struct mbuf *
m_uiomove(struct uio *uio)
{
	struct mbuf *m;			/* current working mbuf */
	struct mbuf *head = NULL;	/* result mbuf chain */
	struct mbuf **mp = &head;
	int flags = M_PKTHDR;
	int nsize;
	int error;
	int resid;

	do {
		if (uio->uio_resid > INT_MAX)
			resid = INT_MAX;
		else
			resid = (int)uio->uio_resid;
		m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
		if (flags) {
			m->m_pkthdr.len = 0;
			/* Leave room for protocol headers. */
			if (resid < MHLEN)
				MH_ALIGN(m, resid);
			flags = 0;
		}
		m->m_len = imin(nsize, resid);
		error = uiomove(mtod(m, caddr_t), m->m_len, uio);
		if (error) {
			m_free(m);
			goto failed;
		}
		*mp = m;
		mp = &m->m_next;
		head->m_pkthdr.len += m->m_len;
	} while (uio->uio_resid > 0);

	return (head);

failed:
	m_freem(head);
	return (NULL);
}

struct mbuf *
m_last(struct mbuf *m)
{
	while (m->m_next)
		m = m->m_next;
	return (m);
}

/*
 * Return the number of bytes in an mbuf chain.
 * If lastm is not NULL, also return the last mbuf.
 */
u_int
m_lengthm(struct mbuf *m, struct mbuf **lastm)
{
	u_int len = 0;
	struct mbuf *prev = m;

	while (m) {
		len += m->m_len;
		prev = m;
		m = m->m_next;
	}
	if (lastm != NULL)
		*lastm = prev;
	return (len);
}
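
/*
 * Usage sketch (illustrative only; "example_xmit_collapse" is
 * hypothetical): a transmit path that has run out of DMA segments can
 * collapse the chain with m_defrag().  The original chain is freed
 * only when defragmentation succeeds, so the caller still owns it on
 * failure.
 */
#if 0
static struct mbuf *
example_xmit_collapse(struct mbuf *m)
{
	struct mbuf *m_new;

	m_new = m_defrag(m, MB_DONTWAIT);
	if (m_new == NULL)
		return (m);	/* caller may retry with or free "m" */
	return (m_new);
}
#endif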

/*
 * Like m_lengthm(), except also keep track of mbuf usage.
 */
u_int
m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
{
	u_int len = 0, mbcnt = 0;
	struct mbuf *prev = m;

	while (m) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT)
			mbcnt += m->m_ext.ext_size;
		prev = m;
		m = m->m_next;
	}
	if (lastm != NULL)
		*lastm = prev;
	*pmbcnt = mbcnt;
	return (len);
}
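
/*
 * Usage sketch (illustrative only; "example_account" is hypothetical):
 * socket-buffer style accounting wants both the payload length and the
 * storage actually consumed, which is what m_countm() reports (MSIZE
 * per mbuf plus any external storage).
 */
#if 0
static void
example_account(struct mbuf *m, u_int *bytes, u_int *storage)
{
	struct mbuf *last;

	*bytes = m_countm(m, &last, storage);
}
#endif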