xref: /dragonfly/sys/kern/uipc_mbuf.c (revision ceb127be)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
5  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
6  *
7  * This code is derived from software contributed to The DragonFly Project
8  * by Jeffrey M. Hsu.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 /*
37  * Copyright (c) 1982, 1986, 1988, 1991, 1993
38  *	The Regents of the University of California.  All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  *
64  * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
65  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
66  */
67 
68 #include "opt_param.h"
69 #include "opt_mbuf_stress_test.h"
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/file.h>
73 #include <sys/malloc.h>
74 #include <sys/mbuf.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/domain.h>
78 #include <sys/objcache.h>
79 #include <sys/tree.h>
80 #include <sys/protosw.h>
81 #include <sys/uio.h>
82 #include <sys/thread.h>
83 #include <sys/globaldata.h>
84 
85 #include <sys/thread2.h>
86 #include <sys/spinlock2.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/limits.h>
90 
91 #include <vm/vm.h>
92 #include <vm/vm_kern.h>
93 #include <vm/vm_extern.h>
94 
95 #ifdef INVARIANTS
96 #include <machine/cpu.h>
97 #endif
98 
99 /*
100  * mbuf cluster meta-data
101  */
102 struct mbcluster {
103 	int32_t	mcl_refs;
104 	void	*mcl_data;
105 };
106 
107 /*
108  * mbuf tracking for debugging purposes
109  */
110 #ifdef MBUF_DEBUG
111 
112 static MALLOC_DEFINE(M_MTRACK, "mtrack", "mtrack");
113 
114 struct mbctrack;
115 RB_HEAD(mbuf_rb_tree, mbtrack);
116 RB_PROTOTYPE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *);
117 
118 struct mbtrack {
119 	RB_ENTRY(mbtrack) rb_node;
120 	int trackid;
121 	struct mbuf *m;
122 };
123 
124 static int
125 mbtrack_cmp(struct mbtrack *mb1, struct mbtrack *mb2)
126 {
127 	if (mb1->m < mb2->m)
128 		return(-1);
129 	if (mb1->m > mb2->m)
130 		return(1);
131 	return(0);
132 }
133 
134 RB_GENERATE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *, m);
135 
136 struct mbuf_rb_tree	mbuf_track_root;
137 static struct spinlock	mbuf_track_spin = SPINLOCK_INITIALIZER(mbuf_track_spin, "mbuf_track_spin");
138 
139 static void
140 mbuftrack(struct mbuf *m)
141 {
142 	struct mbtrack *mbt;
143 
144 	mbt = kmalloc(sizeof(*mbt), M_MTRACK, M_INTWAIT|M_ZERO);
145 	spin_lock(&mbuf_track_spin);
146 	mbt->m = m;
147 	if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt)) {
148 		spin_unlock(&mbuf_track_spin);
149 		panic("mbuftrack: mbuf %p already being tracked", m);
150 	}
151 	spin_unlock(&mbuf_track_spin);
152 }
153 
154 static void
155 mbufuntrack(struct mbuf *m)
156 {
157 	struct mbtrack *mbt;
158 
159 	spin_lock(&mbuf_track_spin);
160 	mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
161 	if (mbt == NULL) {
162 		spin_unlock(&mbuf_track_spin);
163 		panic("mbufuntrack: mbuf %p was not tracked", m);
164 	} else {
165 		mbuf_rb_tree_RB_REMOVE(&mbuf_track_root, mbt);
166 		spin_unlock(&mbuf_track_spin);
167 		kfree(mbt, M_MTRACK);
168 	}
169 }
170 
171 void
172 mbuftrackid(struct mbuf *m, int trackid)
173 {
174 	struct mbtrack *mbt;
175 	struct mbuf *n;
176 
177 	spin_lock(&mbuf_track_spin);
178 	while (m) {
179 		n = m->m_nextpkt;
180 		while (m) {
181 			mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
182 			if (mbt == NULL) {
183 				spin_unlock(&mbuf_track_spin);
184 				panic("mbuftrackid: mbuf %p not tracked", m);
185 			}
186 			mbt->trackid = trackid;
187 			m = m->m_next;
188 		}
189 		m = n;
190 	}
191 	spin_unlock(&mbuf_track_spin);
192 }
193 
194 static int
195 mbuftrack_callback(struct mbtrack *mbt, void *arg)
196 {
197 	struct sysctl_req *req = arg;
198 	char buf[64];
199 	int error;
200 
201 	ksnprintf(buf, sizeof(buf), "mbuf %p track %d\n", mbt->m, mbt->trackid);
202 
203 	spin_unlock(&mbuf_track_spin);
204 	error = SYSCTL_OUT(req, buf, strlen(buf));
205 	spin_lock(&mbuf_track_spin);
206 	if (error)
207 		return(-error);
208 	return(0);
209 }
210 
211 static int
212 mbuftrack_show(SYSCTL_HANDLER_ARGS)
213 {
214 	int error;
215 
216 	spin_lock(&mbuf_track_spin);
217 	error = mbuf_rb_tree_RB_SCAN(&mbuf_track_root, NULL,
218 				     mbuftrack_callback, req);
219 	spin_unlock(&mbuf_track_spin);
220 	return (-error);
221 }
222 SYSCTL_PROC(_kern_ipc, OID_AUTO, showmbufs, CTLFLAG_RD|CTLTYPE_STRING,
223 	    0, 0, mbuftrack_show, "A", "Show all in-use mbufs");
224 
225 #else
226 
227 #define mbuftrack(m)
228 #define mbufuntrack(m)
229 
230 #endif
231 
232 static void mbinit(void *);
233 SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL);
234 
235 struct mbtypes_stat {
236 	u_long	stats[MT_NTYPES];
237 } __cachealign;
238 
239 static struct mbtypes_stat	mbtypes[SMP_MAXCPU];
240 
241 static struct mbstat mbstat[SMP_MAXCPU] __cachealign;
242 int	max_linkhdr;
243 int	max_protohdr;
244 int	max_hdr;
245 int	max_datalen;
246 int	m_defragpackets;
247 int	m_defragbytes;
248 int	m_defraguseless;
249 int	m_defragfailure;
250 #ifdef MBUF_STRESS_TEST
251 int	m_defragrandomfailures;
252 #endif
253 
254 struct objcache *mbuf_cache, *mbufphdr_cache;
255 struct objcache *mclmeta_cache, *mjclmeta_cache;
256 struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;
257 struct objcache *mbufjcluster_cache, *mbufphdrjcluster_cache;
258 
259 struct lock	mbupdate_lk = LOCK_INITIALIZER("mbupdate", 0, LK_CANRECURSE);
260 
261 int		nmbclusters;
262 static int	nmbjclusters;
263 int		nmbufs;
264 
265 static int	mjclph_cachefrac;
266 static int	mjcl_cachefrac;
267 static int	mclph_cachefrac;
268 static int	mcl_cachefrac;
269 
270 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
271 	&max_linkhdr, 0, "Max size of a link-level header");
272 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
273 	&max_protohdr, 0, "Max size of a protocol header");
274 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0,
275 	"Max size of link+protocol headers");
276 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
277 	&max_datalen, 0, "Max data payload size without headers");
278 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
279 	&mbuf_wait, 0, "Time in ticks to sleep after failed mbuf allocations");
280 static int do_mbstat(SYSCTL_HANDLER_ARGS);
281 
282 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT|CTLFLAG_RD,
283 	0, 0, do_mbstat, "S,mbstat", "mbuf usage statistics");
284 
285 static int do_mbtypes(SYSCTL_HANDLER_ARGS);
286 
287 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtypes, CTLTYPE_ULONG|CTLFLAG_RD,
288 	0, 0, do_mbtypes, "LU", "");
289 
290 static int
291 do_mbstat(SYSCTL_HANDLER_ARGS)
292 {
293 	struct mbstat mbstat_total;
294 	struct mbstat *mbstat_totalp;
295 	int i;
296 
297 	bzero(&mbstat_total, sizeof(mbstat_total));
298 	mbstat_totalp = &mbstat_total;
299 
300 	for (i = 0; i < ncpus; i++) {
301 		mbstat_total.m_mbufs += mbstat[i].m_mbufs;
302 		mbstat_total.m_clusters += mbstat[i].m_clusters;
303 		mbstat_total.m_jclusters += mbstat[i].m_jclusters;
304 		mbstat_total.m_clfree += mbstat[i].m_clfree;
305 		mbstat_total.m_drops += mbstat[i].m_drops;
306 		mbstat_total.m_wait += mbstat[i].m_wait;
307 		mbstat_total.m_drain += mbstat[i].m_drain;
308 		mbstat_total.m_mcfail += mbstat[i].m_mcfail;
309 		mbstat_total.m_mpfail += mbstat[i].m_mpfail;
310 
311 	}
312 	/*
313 	 * The following fields are not cumulative fields so just
314 	 * get their values once.
315 	 */
316 	mbstat_total.m_msize = mbstat[0].m_msize;
317 	mbstat_total.m_mclbytes = mbstat[0].m_mclbytes;
318 	mbstat_total.m_minclsize = mbstat[0].m_minclsize;
319 	mbstat_total.m_mlen = mbstat[0].m_mlen;
320 	mbstat_total.m_mhlen = mbstat[0].m_mhlen;
321 
322 	return(sysctl_handle_opaque(oidp, mbstat_totalp, sizeof(mbstat_total), req));
323 }
324 
325 static int
326 do_mbtypes(SYSCTL_HANDLER_ARGS)
327 {
328 	u_long totals[MT_NTYPES];
329 	int i, j;
330 
331 	for (i = 0; i < MT_NTYPES; i++)
332 		totals[i] = 0;
333 
334 	for (i = 0; i < ncpus; i++) {
335 		for (j = 0; j < MT_NTYPES; j++)
336 			totals[j] += mbtypes[i].stats[j];
337 	}
338 
339 	return(sysctl_handle_opaque(oidp, totals, sizeof(totals), req));
340 }
341 
342 /*
343  * The variables may be set as boot-time tunables or live.  Setting these
344  * values too low can deadlock your network.  Network interfaces may also
345  * adjust nmbclusters and/or nmbjclusters to account for preloading the
346  * hardware rings.
347  */
348 static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS);
349 static int sysctl_nmbjclusters(SYSCTL_HANDLER_ARGS);
350 static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS);
351 SYSCTL_PROC(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLTYPE_INT | CTLFLAG_RW,
352 	   0, 0, sysctl_nmbclusters, "I",
353 	   "Maximum number of mbuf clusters available");
354 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjclusters, CTLTYPE_INT | CTLFLAG_RW,
355 	   0, 0, sysctl_nmbjclusters, "I",
356 	   "Maximum number of mbuf jclusters available");
357 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT | CTLFLAG_RW,
358 	   0, 0, sysctl_nmbufs, "I",
359 	   "Maximum number of mbufs available");
360 
361 SYSCTL_INT(_kern_ipc, OID_AUTO, mjclph_cachefrac, CTLFLAG_RD,
362 	   &mjclph_cachefrac, 0,
363 	   "Fraction of cacheable mbuf jclusters w/ pkthdr");
364 SYSCTL_INT(_kern_ipc, OID_AUTO, mjcl_cachefrac, CTLFLAG_RD,
365 	   &mjcl_cachefrac, 0,
366 	   "Fraction of cacheable mbuf jclusters");
367 SYSCTL_INT(_kern_ipc, OID_AUTO, mclph_cachefrac, CTLFLAG_RD,
368     	   &mclph_cachefrac, 0,
369 	   "Fraction of cacheable mbuf clusters w/ pkthdr");
370 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_cachefrac, CTLFLAG_RD,
371     	   &mcl_cachefrac, 0, "Fraction of cacheable mbuf clusters");
372 
373 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
374 	   &m_defragpackets, 0, "Number of defragment packets");
375 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
376 	   &m_defragbytes, 0, "Number of defragment bytes");
377 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
378 	   &m_defraguseless, 0, "Number of useless defragment mbuf chain operations");
379 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
380 	   &m_defragfailure, 0, "Number of failed defragment mbuf chain operations");
381 #ifdef MBUF_STRESS_TEST
382 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
383 	   &m_defragrandomfailures, 0, "");
384 #endif
385 
386 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
387 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
388 static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");
389 
390 static void m_reclaim (void);
391 static void m_mclref(void *arg);
392 static void m_mclfree(void *arg);
393 static void m_mjclfree(void *arg);
394 
395 static void mbupdatelimits(void);
396 
397 /*
398  * NOTE: Default NMBUFS must take into account a possible DOS attack
399  *	 using fd passing on unix domain sockets.
400  */
401 #ifndef NMBCLUSTERS
402 #define NMBCLUSTERS	(512 + maxusers * 16)
403 #endif
404 #ifndef MJCLPH_CACHEFRAC
405 #define MJCLPH_CACHEFRAC 16
406 #endif
407 #ifndef MJCL_CACHEFRAC
408 #define MJCL_CACHEFRAC	4
409 #endif
410 #ifndef MCLPH_CACHEFRAC
411 #define MCLPH_CACHEFRAC	16
412 #endif
413 #ifndef MCL_CACHEFRAC
414 #define MCL_CACHEFRAC	4
415 #endif
416 #ifndef NMBJCLUSTERS
417 #define NMBJCLUSTERS	(NMBCLUSTERS / 2)
418 #endif
419 #ifndef NMBUFS
420 #define NMBUFS		(nmbclusters * 2 + maxfiles)
421 #endif
422 
423 #define NMBCLUSTERS_MIN	(NMBCLUSTERS / 2)
424 #define NMBJCLUSTERS_MIN (NMBJCLUSTERS / 2)
425 #define NMBUFS_MIN	((NMBCLUSTERS * 2 + maxfiles) / 2)
426 
427 /*
428  * Perform sanity checks of tunables declared above.
429  */
430 static void
431 tunable_mbinit(void *dummy)
432 {
433 	/*
434 	 * This has to be done before VM init.
435 	 */
436 	nmbclusters = NMBCLUSTERS;
437 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
438 	mjclph_cachefrac = MJCLPH_CACHEFRAC;
439 	TUNABLE_INT_FETCH("kern.ipc.mjclph_cachefrac", &mjclph_cachefrac);
440 	mjcl_cachefrac = MJCL_CACHEFRAC;
441 	TUNABLE_INT_FETCH("kern.ipc.mjcl_cachefrac", &mjcl_cachefrac);
442 	mclph_cachefrac = MCLPH_CACHEFRAC;
443 	TUNABLE_INT_FETCH("kern.ipc.mclph_cachefrac", &mclph_cachefrac);
444 	mcl_cachefrac = MCL_CACHEFRAC;
445 	TUNABLE_INT_FETCH("kern.ipc.mcl_cachefrac", &mcl_cachefrac);
446 
447 	/*
448 	 * WARNING! each mcl cache feeds two mbuf caches, so the minimum
449 	 *	    cachefrac is 2.  For safety, use 3.
450 	 */
451 	if (mjclph_cachefrac < 3)
452 		mjclph_cachefrac = 3;
453 	if (mjcl_cachefrac < 3)
454 		mjcl_cachefrac = 3;
455 	if (mclph_cachefrac < 3)
456 		mclph_cachefrac = 3;
457 	if (mcl_cachefrac < 3)
458 		mcl_cachefrac = 3;
459 
460 	nmbjclusters = NMBJCLUSTERS;
461 	TUNABLE_INT_FETCH("kern.ipc.nmbjclusters", &nmbjclusters);
462 
463 	nmbufs = NMBUFS;
464 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
465 
466 	/* Sanity checks */
467 	if (nmbufs < nmbclusters * 2)
468 		nmbufs = nmbclusters * 2;
469 }
470 SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
471 	tunable_mbinit, NULL);
472 
473 static void
474 mbinclimit(int *limit, int inc, int minlim)
475 {
476 	int new_limit;
477 
478 	lockmgr(&mbupdate_lk, LK_EXCLUSIVE);
479 
480 	new_limit = *limit + inc;
481 	if (new_limit < minlim)
482 		new_limit = minlim;
483 
484 	if (*limit != new_limit) {
485 		*limit = new_limit;
486 		mbupdatelimits();
487 	}
488 
489 	lockmgr(&mbupdate_lk, LK_RELEASE);
490 }
491 
492 static int
493 mbsetlimit(int *limit, int new_limit, int minlim)
494 {
495 	if (new_limit < minlim)
496 		return EINVAL;
497 
498 	lockmgr(&mbupdate_lk, LK_EXCLUSIVE);
499 	mbinclimit(limit, new_limit - *limit, minlim);
500 	lockmgr(&mbupdate_lk, LK_RELEASE);
501 	return 0;
502 }
503 
504 static int
505 sysctl_mblimit(SYSCTL_HANDLER_ARGS, int *limit, int minlim)
506 {
507 	int error, value;
508 
509 	value = *limit;
510 	error = sysctl_handle_int(oidp, &value, 0, req);
511 	if (error || req->newptr == NULL)
512 		return error;
513 
514 	return mbsetlimit(limit, value, minlim);
515 }
516 
517 /*
518  * Sysctl support to update nmbclusters, nmbjclusters, and nmbufs.
519  */
520 static int
521 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
522 {
523 	return sysctl_mblimit(oidp, arg1, arg2, req, &nmbclusters,
524 	    NMBCLUSTERS_MIN);
525 }
526 
527 static int
528 sysctl_nmbjclusters(SYSCTL_HANDLER_ARGS)
529 {
530 	return sysctl_mblimit(oidp, arg1, arg2, req, &nmbjclusters,
531 	    NMBJCLUSTERS_MIN);
532 }
533 
534 static int
535 sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
536 {
537 	return sysctl_mblimit(oidp, arg1, arg2, req, &nmbufs, NMBUFS_MIN);
538 }
539 
540 void
541 mcl_inclimit(int inc)
542 {
543 	mbinclimit(&nmbclusters, inc, NMBCLUSTERS_MIN);
544 }
545 
546 void
547 mjcl_inclimit(int inc)
548 {
549 	mbinclimit(&nmbjclusters, inc, NMBJCLUSTERS_MIN);
550 }
551 
552 void
553 mb_inclimit(int inc)
554 {
555 	mbinclimit(&nmbufs, inc, NMBUFS_MIN);
556 }
557 
558 /* "number of clusters of pages" */
559 #define NCL_INIT	1
560 
561 #define NMB_INIT	16
562 
563 /*
564  * The mbuf object cache only guarantees that m_next and m_nextpkt are
565  * NULL and that m_data points to the beginning of the data area.  In
566  * particular, m_len and m_pkthdr.len are uninitialized.  It is the
567  * responsibility of the caller to initialize those fields before use.
568  */
569 static __inline boolean_t
570 mbuf_ctor(void *obj, void *private, int ocflags)
571 {
572 	struct mbuf *m = obj;
573 
574 	m->m_next = NULL;
575 	m->m_nextpkt = NULL;
576 	m->m_data = m->m_dat;
577 	m->m_flags = 0;
578 
579 	return (TRUE);
580 }
581 
582 /*
583  * Initialize the mbuf and the packet header fields.
584  */
585 static boolean_t
586 mbufphdr_ctor(void *obj, void *private, int ocflags)
587 {
588 	struct mbuf *m = obj;
589 
590 	m->m_next = NULL;
591 	m->m_nextpkt = NULL;
592 	m->m_data = m->m_pktdat;
593 	m->m_flags = M_PKTHDR | M_PHCACHE;
594 
595 	m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
596 	SLIST_INIT(&m->m_pkthdr.tags);
597 	m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
598 	m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
599 
600 	return (TRUE);
601 }
602 
603 /*
604  * A mbcluster object consists of 2K (MCLBYTES) cluster and a refcount.
605  */
606 static boolean_t
607 mclmeta_ctor(void *obj, void *private, int ocflags)
608 {
609 	struct mbcluster *cl = obj;
610 	void *buf;
611 
612 	if (ocflags & M_NOWAIT)
613 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
614 	else
615 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
616 	if (buf == NULL)
617 		return (FALSE);
618 	cl->mcl_refs = 0;
619 	cl->mcl_data = buf;
620 	return (TRUE);
621 }
622 
623 static boolean_t
624 mjclmeta_ctor(void *obj, void *private, int ocflags)
625 {
626 	struct mbcluster *cl = obj;
627 	void *buf;
628 
629 	if (ocflags & M_NOWAIT)
630 		buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_NOWAIT | M_ZERO);
631 	else
632 		buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_INTWAIT | M_ZERO);
633 	if (buf == NULL)
634 		return (FALSE);
635 	cl->mcl_refs = 0;
636 	cl->mcl_data = buf;
637 	return (TRUE);
638 }
639 
640 static void
641 mclmeta_dtor(void *obj, void *private)
642 {
643 	struct mbcluster *mcl = obj;
644 
645 	KKASSERT(mcl->mcl_refs == 0);
646 	kfree(mcl->mcl_data, M_MBUFCL);
647 }
648 
649 static void
650 linkjcluster(struct mbuf *m, struct mbcluster *cl, uint size)
651 {
652 	/*
653 	 * Add the cluster to the mbuf.  The caller will detect that the
654 	 * mbuf now has an attached cluster.
655 	 */
656 	m->m_ext.ext_arg = cl;
657 	m->m_ext.ext_buf = cl->mcl_data;
658 	m->m_ext.ext_ref = m_mclref;
659 	if (size != MCLBYTES)
660 		m->m_ext.ext_free = m_mjclfree;
661 	else
662 		m->m_ext.ext_free = m_mclfree;
663 	m->m_ext.ext_size = size;
664 	atomic_add_int(&cl->mcl_refs, 1);
665 
666 	m->m_data = m->m_ext.ext_buf;
667 	m->m_flags |= M_EXT | M_EXT_CLUSTER;
668 }
669 
670 static void
671 linkcluster(struct mbuf *m, struct mbcluster *cl)
672 {
673 	linkjcluster(m, cl, MCLBYTES);
674 }
675 
676 static boolean_t
677 mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
678 {
679 	struct mbuf *m = obj;
680 	struct mbcluster *cl;
681 
682 	mbufphdr_ctor(obj, private, ocflags);
683 	cl = objcache_get(mclmeta_cache, ocflags);
684 	if (cl == NULL) {
685 		++mbstat[mycpu->gd_cpuid].m_drops;
686 		return (FALSE);
687 	}
688 	m->m_flags |= M_CLCACHE;
689 	linkcluster(m, cl);
690 	return (TRUE);
691 }
692 
693 static boolean_t
694 mbufphdrjcluster_ctor(void *obj, void *private, int ocflags)
695 {
696 	struct mbuf *m = obj;
697 	struct mbcluster *cl;
698 
699 	mbufphdr_ctor(obj, private, ocflags);
700 	cl = objcache_get(mjclmeta_cache, ocflags);
701 	if (cl == NULL) {
702 		++mbstat[mycpu->gd_cpuid].m_drops;
703 		return (FALSE);
704 	}
705 	m->m_flags |= M_CLCACHE;
706 	linkjcluster(m, cl, MJUMPAGESIZE);
707 	return (TRUE);
708 }
709 
710 static boolean_t
711 mbufcluster_ctor(void *obj, void *private, int ocflags)
712 {
713 	struct mbuf *m = obj;
714 	struct mbcluster *cl;
715 
716 	mbuf_ctor(obj, private, ocflags);
717 	cl = objcache_get(mclmeta_cache, ocflags);
718 	if (cl == NULL) {
719 		++mbstat[mycpu->gd_cpuid].m_drops;
720 		return (FALSE);
721 	}
722 	m->m_flags |= M_CLCACHE;
723 	linkcluster(m, cl);
724 	return (TRUE);
725 }
726 
727 static boolean_t
728 mbufjcluster_ctor(void *obj, void *private, int ocflags)
729 {
730 	struct mbuf *m = obj;
731 	struct mbcluster *cl;
732 
733 	mbuf_ctor(obj, private, ocflags);
734 	cl = objcache_get(mjclmeta_cache, ocflags);
735 	if (cl == NULL) {
736 		++mbstat[mycpu->gd_cpuid].m_drops;
737 		return (FALSE);
738 	}
739 	m->m_flags |= M_CLCACHE;
740 	linkjcluster(m, cl, MJUMPAGESIZE);
741 	return (TRUE);
742 }
743 
744 /*
745  * Used for both the cluster and cluster PHDR caches.
746  *
747  * The mbuf may have lost its cluster due to sharing, deal
748  * with the situation by checking M_EXT.
749  */
750 static void
751 mbufcluster_dtor(void *obj, void *private)
752 {
753 	struct mbuf *m = obj;
754 	struct mbcluster *mcl;
755 
756 	if (m->m_flags & M_EXT) {
757 		KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
758 		mcl = m->m_ext.ext_arg;
759 		KKASSERT(mcl->mcl_refs == 1);
760 		mcl->mcl_refs = 0;
761 		if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES)
762 			objcache_put(mjclmeta_cache, mcl);
763 		else
764 			objcache_put(mclmeta_cache, mcl);
765 	}
766 }
767 
768 struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
769 struct objcache_malloc_args mclmeta_malloc_args =
770 	{ sizeof(struct mbcluster), M_MCLMETA };
771 
772 /* ARGSUSED*/
773 static void
774 mbinit(void *dummy)
775 {
776 	int mb_limit, cl_limit, ncl_limit, jcl_limit;
777 	int limit;
778 	int i;
779 
780 	/*
781 	 * Initialize statistics
782 	 */
783 	for (i = 0; i < ncpus; i++) {
784 		mbstat[i].m_msize = MSIZE;
785 		mbstat[i].m_mclbytes = MCLBYTES;
786 		mbstat[i].m_mjumpagesize = MJUMPAGESIZE;
787 		mbstat[i].m_minclsize = MINCLSIZE;
788 		mbstat[i].m_mlen = MLEN;
789 		mbstat[i].m_mhlen = MHLEN;
790 	}
791 
792 	/*
793 	 * Create object caches and save cluster limits, which will
794 	 * be used to adjust backing kmalloc pools' limit later.
795 	 */
796 
797 	mb_limit = cl_limit = 0;
798 
799 	limit = nmbufs;
800 	mbuf_cache = objcache_create("mbuf",
801 	    limit, nmbufs / 4,
802 	    mbuf_ctor, NULL, NULL,
803 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
804 	mb_limit += limit;
805 
806 	limit = nmbufs;
807 	mbufphdr_cache = objcache_create("mbuf pkt hdr",
808 	    limit, nmbufs / 4,
809 	    mbufphdr_ctor, NULL, NULL,
810 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
811 	mb_limit += limit;
812 
813 	ncl_limit = nmbclusters;
814 	mclmeta_cache = objcache_create("cluster mbuf",
815 	    ncl_limit, nmbclusters / 4,
816 	    mclmeta_ctor, mclmeta_dtor, NULL,
817 	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
818 	cl_limit += ncl_limit;
819 
820 	jcl_limit = nmbjclusters;
821 	mjclmeta_cache = objcache_create("jcluster mbuf",
822 	    jcl_limit, nmbjclusters / 4,
823 	    mjclmeta_ctor, mclmeta_dtor, NULL,
824 	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
825 	cl_limit += jcl_limit;
826 
827 	limit = nmbclusters;
828 	mbufcluster_cache = objcache_create("mbuf + cluster",
829 	    limit, nmbclusters / mcl_cachefrac,
830 	    mbufcluster_ctor, mbufcluster_dtor, NULL,
831 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
832 	mb_limit += limit;
833 
834 	limit = nmbclusters;
835 	mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
836 	    limit, nmbclusters / mclph_cachefrac,
837 	    mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
838 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
839 	mb_limit += limit;
840 
841 	limit = nmbjclusters;
842 	mbufjcluster_cache = objcache_create("mbuf + jcluster",
843 	    limit, nmbjclusters / mjcl_cachefrac,
844 	    mbufjcluster_ctor, mbufcluster_dtor, NULL,
845 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
846 	mb_limit += limit;
847 
848 	limit = nmbjclusters;
849 	mbufphdrjcluster_cache = objcache_create("mbuf pkt hdr + jcluster",
850 	    limit, nmbjclusters / mjclph_cachefrac,
851 	    mbufphdrjcluster_ctor, mbufcluster_dtor, NULL,
852 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
853 	mb_limit += limit;
854 
855 	/*
856 	 * Adjust backing kmalloc pools' limit
857 	 *
858 	 * NOTE: We raise the limit by another 1/8 to take the effect
859 	 * of loosememuse into account.
860 	 */
861 	cl_limit += cl_limit / 8;
862 	kmalloc_raise_limit(mclmeta_malloc_args.mtype,
863 			    mclmeta_malloc_args.objsize * (size_t)cl_limit);
864 	kmalloc_raise_limit(M_MBUFCL,
865 			    (MCLBYTES * (size_t)ncl_limit) +
866 			    (MJUMPAGESIZE * (size_t)jcl_limit));
867 
868 	mb_limit += mb_limit / 8;
869 	kmalloc_raise_limit(mbuf_malloc_args.mtype,
870 			    mbuf_malloc_args.objsize * (size_t)mb_limit);
871 }
872 
873 /*
874  * Adjust mbuf limits after changes have been made
875  *
876  * Caller must hold mbupdate_lk
877  */
878 static void
879 mbupdatelimits(void)
880 {
881 	int mb_limit, cl_limit, ncl_limit, jcl_limit;
882 	int limit;
883 
884 	KASSERT(lockstatus(&mbupdate_lk, curthread) != 0,
885 	    ("mbupdate_lk is not held"));
886 
887 	/*
888 	 * Figure out adjustments to object caches after nmbufs, nmbclusters,
889 	 * or nmbjclusters has been modified.
890 	 */
891 	mb_limit = cl_limit = 0;
892 
893 	limit = nmbufs;
894 	objcache_set_cluster_limit(mbuf_cache, limit);
895 	mb_limit += limit;
896 
897 	limit = nmbufs;
898 	objcache_set_cluster_limit(mbufphdr_cache, limit);
899 	mb_limit += limit;
900 
901 	ncl_limit = nmbclusters;
902 	objcache_set_cluster_limit(mclmeta_cache, ncl_limit);
903 	cl_limit += ncl_limit;
904 
905 	jcl_limit = nmbjclusters;
906 	objcache_set_cluster_limit(mjclmeta_cache, jcl_limit);
907 	cl_limit += jcl_limit;
908 
909 	limit = nmbclusters;
910 	objcache_set_cluster_limit(mbufcluster_cache, limit);
911 	mb_limit += limit;
912 
913 	limit = nmbclusters;
914 	objcache_set_cluster_limit(mbufphdrcluster_cache, limit);
915 	mb_limit += limit;
916 
917 	limit = nmbjclusters;
918 	objcache_set_cluster_limit(mbufjcluster_cache, limit);
919 	mb_limit += limit;
920 
921 	limit = nmbjclusters;
922 	objcache_set_cluster_limit(mbufphdrjcluster_cache, limit);
923 	mb_limit += limit;
924 
925 	/*
926 	 * Adjust backing kmalloc pools' limit
927 	 *
928 	 * NOTE: We raise the limit by another 1/8 to take the effect
929 	 * of loosememuse into account.
930 	 */
931 	cl_limit += cl_limit / 8;
932 	kmalloc_raise_limit(mclmeta_malloc_args.mtype,
933 			    mclmeta_malloc_args.objsize * (size_t)cl_limit);
934 	kmalloc_raise_limit(M_MBUFCL,
935 			    (MCLBYTES * (size_t)ncl_limit) +
936 			    (MJUMPAGESIZE * (size_t)jcl_limit));
937 	mb_limit += mb_limit / 8;
938 	kmalloc_raise_limit(mbuf_malloc_args.mtype,
939 			    mbuf_malloc_args.objsize * (size_t)mb_limit);
940 }
941 
942 /*
943  * Return the number of references to this mbuf's data.  0 is returned
944  * if the mbuf is not M_EXT, a reference count is returned if it is
945  * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
946  */
947 int
948 m_sharecount(struct mbuf *m)
949 {
950 	switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
951 	case 0:
952 		return (0);
953 	case M_EXT:
954 		return (99);
955 	case M_EXT | M_EXT_CLUSTER:
956 		return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
957 	}
958 	/* NOTREACHED */
959 	return (0);		/* to shut up compiler */
960 }
961 
962 /*
963  * change mbuf to new type
964  */
965 void
966 m_chtype(struct mbuf *m, int type)
967 {
968 	struct globaldata *gd = mycpu;
969 
970 	++mbtypes[gd->gd_cpuid].stats[type];
971 	--mbtypes[gd->gd_cpuid].stats[m->m_type];
972 	m->m_type = type;
973 }
974 
975 static void
976 m_reclaim(void)
977 {
978 	struct domain *dp;
979 	struct protosw *pr;
980 
981 	kprintf("Debug: m_reclaim() called\n");
982 
983 	SLIST_FOREACH(dp, &domains, dom_next) {
984 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
985 			if (pr->pr_drain)
986 				(*pr->pr_drain)();
987 		}
988 	}
989 	++mbstat[mycpu->gd_cpuid].m_drain;
990 }
991 
992 static __inline void
993 updatestats(struct mbuf *m, int type)
994 {
995 	struct globaldata *gd = mycpu;
996 
997 	m->m_type = type;
998 	mbuftrack(m);
999 #ifdef MBUF_DEBUG
1000 	KASSERT(m->m_next == NULL, ("mbuf %p: bad m_next in get", m));
1001 	KASSERT(m->m_nextpkt == NULL, ("mbuf %p: bad m_nextpkt in get", m));
1002 #endif
1003 
1004 	++mbtypes[gd->gd_cpuid].stats[type];
1005 	++mbstat[gd->gd_cpuid].m_mbufs;
1006 
1007 }
1008 
1009 /*
1010  * Allocate an mbuf.
1011  */
1012 struct mbuf *
1013 m_get(int how, int type)
1014 {
1015 	struct mbuf *m;
1016 	int ntries = 0;
1017 	int ocf = MB_OCFLAG(how);
1018 
1019 retryonce:
1020 
1021 	m = objcache_get(mbuf_cache, ocf);
1022 
1023 	if (m == NULL) {
1024 		if ((ocf & M_WAITOK) && ntries++ == 0) {
1025 			struct objcache *reclaimlist[] = {
1026 				mbufphdr_cache,
1027 				mbufcluster_cache,
1028 				mbufphdrcluster_cache,
1029 				mbufjcluster_cache,
1030 				mbufphdrjcluster_cache
1031 			};
1032 			const int nreclaims = NELEM(reclaimlist);
1033 
1034 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
1035 				m_reclaim();
1036 			goto retryonce;
1037 		}
1038 		++mbstat[mycpu->gd_cpuid].m_drops;
1039 		return (NULL);
1040 	}
1041 #ifdef MBUF_DEBUG
1042 	KASSERT(m->m_data == m->m_dat, ("mbuf %p: bad m_data in get", m));
1043 #endif
1044 	m->m_len = 0;
1045 
1046 	updatestats(m, type);
1047 	return (m);
1048 }
1049 
1050 struct mbuf *
1051 m_gethdr(int how, int type)
1052 {
1053 	struct mbuf *m;
1054 	int ocf = MB_OCFLAG(how);
1055 	int ntries = 0;
1056 
1057 retryonce:
1058 
1059 	m = objcache_get(mbufphdr_cache, ocf);
1060 
1061 	if (m == NULL) {
1062 		if ((ocf & M_WAITOK) && ntries++ == 0) {
1063 			struct objcache *reclaimlist[] = {
1064 				mbuf_cache,
1065 				mbufcluster_cache, mbufphdrcluster_cache,
1066 				mbufjcluster_cache, mbufphdrjcluster_cache
1067 			};
1068 			const int nreclaims = NELEM(reclaimlist);
1069 
1070 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
1071 				m_reclaim();
1072 			goto retryonce;
1073 		}
1074 		++mbstat[mycpu->gd_cpuid].m_drops;
1075 		return (NULL);
1076 	}
1077 #ifdef MBUF_DEBUG
1078 	KASSERT(m->m_data == m->m_pktdat, ("mbuf %p: bad m_data in get", m));
1079 #endif
1080 	m->m_len = 0;
1081 	m->m_pkthdr.len = 0;
1082 
1083 	updatestats(m, type);
1084 	return (m);
1085 }
1086 
1087 /*
1088  * Get a mbuf (not a mbuf cluster!) and zero it.
1089  * Deprecated.
1090  */
1091 struct mbuf *
1092 m_getclr(int how, int type)
1093 {
1094 	struct mbuf *m;
1095 
1096 	m = m_get(how, type);
1097 	if (m != NULL)
1098 		bzero(m->m_data, MLEN);
1099 	return (m);
1100 }
1101 
1102 static struct mbuf *
1103 m_getcl_cache(int how, short type, int flags, struct objcache *mbclc,
1104     struct objcache *mbphclc, u_long *cl_stats)
1105 {
1106 	struct mbuf *m = NULL;
1107 	int ocflags = MB_OCFLAG(how);
1108 	int ntries = 0;
1109 
1110 retryonce:
1111 
1112 	if (flags & M_PKTHDR)
1113 		m = objcache_get(mbphclc, ocflags);
1114 	else
1115 		m = objcache_get(mbclc, ocflags);
1116 
1117 	if (m == NULL) {
1118 		if ((ocflags & M_WAITOK) && ntries++ == 0) {
1119 			struct objcache *reclaimlist[1];
1120 
1121 			if (flags & M_PKTHDR)
1122 				reclaimlist[0] = mbclc;
1123 			else
1124 				reclaimlist[0] = mbphclc;
1125 			if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
1126 				m_reclaim();
1127 			goto retryonce;
1128 		}
1129 		++mbstat[mycpu->gd_cpuid].m_drops;
1130 		return (NULL);
1131 	}
1132 
1133 #ifdef MBUF_DEBUG
1134 	KASSERT(m->m_data == m->m_ext.ext_buf,
1135 		("mbuf %p: bad m_data in get", m));
1136 #endif
1137 	m->m_type = type;
1138 	m->m_len = 0;
1139 	m->m_pkthdr.len = 0;	/* just do it unconditonally */
1140 
1141 	mbuftrack(m);
1142 
1143 	++mbtypes[mycpu->gd_cpuid].stats[type];
1144 	++(*cl_stats);
1145 	return (m);
1146 }
1147 
1148 struct mbuf *
1149 m_getjcl(int how, short type, int flags, size_t size)
1150 {
1151 	struct objcache *mbclc, *mbphclc;
1152 	u_long *cl_stats;
1153 
1154 	switch (size) {
1155 	case MCLBYTES:
1156 		mbclc = mbufcluster_cache;
1157 		mbphclc = mbufphdrcluster_cache;
1158 		cl_stats = &mbstat[mycpu->gd_cpuid].m_clusters;
1159 		break;
1160 
1161 	default:
1162 		mbclc = mbufjcluster_cache;
1163 		mbphclc = mbufphdrjcluster_cache;
1164 		cl_stats = &mbstat[mycpu->gd_cpuid].m_jclusters;
1165 		break;
1166 	}
1167 	return m_getcl_cache(how, type, flags, mbclc, mbphclc, cl_stats);
1168 }
1169 
1170 /*
1171  * Returns an mbuf with an attached cluster.
1172  * Because many network drivers use this kind of buffers a lot, it is
1173  * convenient to keep a small pool of free buffers of this kind.
1174  * Even a small size such as 10 gives about 10% improvement in the
1175  * forwarding rate in a bridge or router.
1176  */
1177 struct mbuf *
1178 m_getcl(int how, short type, int flags)
1179 {
1180 	return m_getcl_cache(how, type, flags,
1181 	    mbufcluster_cache, mbufphdrcluster_cache,
1182 	    &mbstat[mycpu->gd_cpuid].m_clusters);
1183 }
1184 
1185 /*
1186  * Allocate chain of requested length.
1187  */
1188 struct mbuf *
1189 m_getc(int len, int how, int type)
1190 {
1191 	struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
1192 	int nsize;
1193 
1194 	while (len > 0) {
1195 		n = m_getl(len, how, type, 0, &nsize);
1196 		if (n == NULL)
1197 			goto failed;
1198 		n->m_len = 0;
1199 		*ntail = n;
1200 		ntail = &n->m_next;
1201 		len -= nsize;
1202 	}
1203 	return (nfirst);
1204 
1205 failed:
1206 	m_freem(nfirst);
1207 	return (NULL);
1208 }
1209 
1210 /*
1211  * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
1212  * and return a pointer to the head of the allocated chain. If m0 is
1213  * non-null, then we assume that it is a single mbuf or an mbuf chain to
1214  * which we want len bytes worth of mbufs and/or clusters attached, and so
1215  * if we succeed in allocating it, we will just return a pointer to m0.
1216  *
1217  * If we happen to fail at any point during the allocation, we will free
1218  * up everything we have already allocated and return NULL.
1219  *
1220  * Deprecated.  Use m_getc() and m_cat() instead.
1221  */
1222 struct mbuf *
1223 m_getm(struct mbuf *m0, int len, int type, int how)
1224 {
1225 	struct mbuf *nfirst;
1226 
1227 	nfirst = m_getc(len, how, type);
1228 
1229 	if (m0 != NULL) {
1230 		m_last(m0)->m_next = nfirst;
1231 		return (m0);
1232 	}
1233 
1234 	return (nfirst);
1235 }
1236 
1237 /*
1238  * Adds a cluster to a normal mbuf, M_EXT is set on success.
1239  * Deprecated.  Use m_getcl() instead.
1240  */
1241 void
1242 m_mclget(struct mbuf *m, int how)
1243 {
1244 	struct mbcluster *mcl;
1245 
1246 	KKASSERT((m->m_flags & M_EXT) == 0);
1247 	mcl = objcache_get(mclmeta_cache, MB_OCFLAG(how));
1248 	if (mcl != NULL) {
1249 		linkcluster(m, mcl);
1250 		++mbstat[mycpu->gd_cpuid].m_clusters;
1251 	} else {
1252 		++mbstat[mycpu->gd_cpuid].m_drops;
1253 	}
1254 }
1255 
1256 /*
1257  * Updates to mbcluster must be MPSAFE.  Only an entity which already has
1258  * a reference to the cluster can ref it, so we are in no danger of
1259  * racing an add with a subtract.  But the operation must still be atomic
1260  * since multiple entities may have a reference on the cluster.
1261  *
1262  * m_mclfree() is almost the same but it must contend with two entities
1263  * freeing the cluster at the same time.
1264  */
1265 static void
1266 m_mclref(void *arg)
1267 {
1268 	struct mbcluster *mcl = arg;
1269 
1270 	atomic_add_int(&mcl->mcl_refs, 1);
1271 }
1272 
1273 /*
1274  * When dereferencing a cluster we have to deal with a N->0 race, where
1275  * N entities free their references simultaniously.  To do this we use
1276  * atomic_fetchadd_int().
1277  */
1278 static void
1279 m_mclfree(void *arg)
1280 {
1281 	struct mbcluster *mcl = arg;
1282 
1283 	if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) {
1284 		--mbstat[mycpu->gd_cpuid].m_clusters;
1285 		objcache_put(mclmeta_cache, mcl);
1286 	}
1287 }
1288 
1289 static void
1290 m_mjclfree(void *arg)
1291 {
1292 	struct mbcluster *mcl = arg;
1293 
1294 	if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) {
1295 		--mbstat[mycpu->gd_cpuid].m_jclusters;
1296 		objcache_put(mjclmeta_cache, mcl);
1297 	}
1298 }
1299 
1300 /*
1301  * Free a single mbuf and any associated external storage.  The successor,
1302  * if any, is returned.
1303  *
1304  * We do need to check non-first mbuf for m_aux, since some of existing
1305  * code does not call M_PREPEND properly.
1306  * (example: call to bpf_mtap from drivers)
1307  */
1308 
1309 #ifdef MBUF_DEBUG
1310 
1311 struct mbuf  *
1312 _m_free(struct mbuf *m, const char *func)
1313 
1314 #else
1315 
1316 struct mbuf *
1317 m_free(struct mbuf *m)
1318 
1319 #endif
1320 {
1321 	struct mbuf *n;
1322 	struct globaldata *gd = mycpu;
1323 
1324 	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
1325 	KASSERT(M_TRAILINGSPACE(m) >= 0, ("overflowed mbuf %p", m));
1326 	--mbtypes[gd->gd_cpuid].stats[m->m_type];
1327 
1328 	n = m->m_next;
1329 
1330 	/*
1331 	 * Make sure the mbuf is in constructed state before returning it
1332 	 * to the objcache.
1333 	 */
1334 	m->m_next = NULL;
1335 	mbufuntrack(m);
1336 #ifdef MBUF_DEBUG
1337 	m->m_hdr.mh_lastfunc = func;
1338 #endif
1339 #ifdef notyet
1340 	KKASSERT(m->m_nextpkt == NULL);
1341 #else
1342 	if (m->m_nextpkt != NULL) {
1343 		static int afewtimes = 10;
1344 
1345 		if (afewtimes-- > 0) {
1346 			kprintf("mfree: m->m_nextpkt != NULL\n");
1347 			print_backtrace(-1);
1348 		}
1349 		m->m_nextpkt = NULL;
1350 	}
1351 #endif
1352 	if (m->m_flags & M_PKTHDR) {
1353 		m_tag_delete_chain(m);		/* eliminate XXX JH */
1354 	}
1355 
1356 	m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
1357 
1358 	/*
1359 	 * Clean the M_PKTHDR state so we can return the mbuf to its original
1360 	 * cache.  This is based on the PHCACHE flag which tells us whether
1361 	 * the mbuf was originally allocated out of a packet-header cache
1362 	 * or a non-packet-header cache.
1363 	 */
1364 	if (m->m_flags & M_PHCACHE) {
1365 		m->m_flags |= M_PKTHDR;
1366 		m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
1367 		m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
1368 		m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
1369 		SLIST_INIT(&m->m_pkthdr.tags);
1370 	}
1371 
1372 	/*
1373 	 * Handle remaining flags combinations.  M_CLCACHE tells us whether
1374 	 * the mbuf was originally allocated from a cluster cache or not,
1375 	 * and is totally separate from whether the mbuf is currently
1376 	 * associated with a cluster.
1377 	 */
1378 	switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
1379 	case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
1380 		/*
1381 		 * mbuf+cluster cache case.  The mbuf was allocated from the
1382 		 * combined mbuf_cluster cache and can be returned to the
1383 		 * cache if the cluster hasn't been shared.
1384 		 */
1385 		if (m_sharecount(m) == 1) {
1386 			/*
1387 			 * The cluster has not been shared, we can just
1388 			 * reset the data pointer and return the mbuf
1389 			 * to the cluster cache.  Note that the reference
1390 			 * count is left intact (it is still associated with
1391 			 * an mbuf).
1392 			 */
1393 			m->m_data = m->m_ext.ext_buf;
1394 			if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES) {
1395 				if (m->m_flags & M_PHCACHE)
1396 					objcache_put(mbufphdrjcluster_cache, m);
1397 				else
1398 					objcache_put(mbufjcluster_cache, m);
1399 				--mbstat[mycpu->gd_cpuid].m_jclusters;
1400 			} else {
1401 				if (m->m_flags & M_PHCACHE)
1402 					objcache_put(mbufphdrcluster_cache, m);
1403 				else
1404 					objcache_put(mbufcluster_cache, m);
1405 				--mbstat[mycpu->gd_cpuid].m_clusters;
1406 			}
1407 		} else {
1408 			/*
1409 			 * Hell.  Someone else has a ref on this cluster,
1410 			 * we have to disconnect it which means we can't
1411 			 * put it back into the mbufcluster_cache, we
1412 			 * have to destroy the mbuf.
1413 			 *
1414 			 * Other mbuf references to the cluster will typically
1415 			 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
1416 			 *
1417 			 * XXX we could try to connect another cluster to
1418 			 * it.
1419 			 */
1420 			m->m_ext.ext_free(m->m_ext.ext_arg);
1421 			m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1422 			if (m->m_ext.ext_size == MCLBYTES) {
1423 				if (m->m_flags & M_PHCACHE)
1424 					objcache_dtor(mbufphdrcluster_cache, m);
1425 				else
1426 					objcache_dtor(mbufcluster_cache, m);
1427 			} else {
1428 				if (m->m_flags & M_PHCACHE)
1429 					objcache_dtor(mbufphdrjcluster_cache, m);
1430 				else
1431 					objcache_dtor(mbufjcluster_cache, m);
1432 			}
1433 		}
1434 		break;
1435 	case M_EXT | M_EXT_CLUSTER:
1436 	case M_EXT:
1437 		/*
1438 		 * Normal cluster association case, disconnect the cluster from
1439 		 * the mbuf.  The cluster may or may not be custom.
1440 		 */
1441 		m->m_ext.ext_free(m->m_ext.ext_arg);
1442 		m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1443 		/* fall through */
1444 	case 0:
1445 		/*
1446 		 * return the mbuf to the mbuf cache.
1447 		 */
1448 		if (m->m_flags & M_PHCACHE) {
1449 			m->m_data = m->m_pktdat;
1450 			objcache_put(mbufphdr_cache, m);
1451 		} else {
1452 			m->m_data = m->m_dat;
1453 			objcache_put(mbuf_cache, m);
1454 		}
1455 		--mbstat[mycpu->gd_cpuid].m_mbufs;
1456 		break;
1457 	default:
1458 		if (!panicstr)
1459 			panic("bad mbuf flags %p %08x", m, m->m_flags);
1460 		break;
1461 	}
1462 	return (n);
1463 }
1464 
1465 #ifdef MBUF_DEBUG
1466 
1467 void
1468 _m_freem(struct mbuf *m, const char *func)
1469 {
1470 	while (m)
1471 		m = _m_free(m, func);
1472 }
1473 
1474 #else
1475 
1476 void
1477 m_freem(struct mbuf *m)
1478 {
1479 	while (m)
1480 		m = m_free(m);
1481 }
1482 
1483 #endif
1484 
1485 void
1486 m_extadd(struct mbuf *m, caddr_t buf, u_int size,  void (*reff)(void *),
1487     void (*freef)(void *), void *arg)
1488 {
1489 	m->m_ext.ext_arg = arg;
1490 	m->m_ext.ext_buf = buf;
1491 	m->m_ext.ext_ref = reff;
1492 	m->m_ext.ext_free = freef;
1493 	m->m_ext.ext_size = size;
1494 	reff(arg);
1495 	m->m_data = buf;
1496 	m->m_flags |= M_EXT;
1497 }
1498 
1499 /*
1500  * mbuf utility routines
1501  */
1502 
1503 /*
1504  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
1505  * copy junk along.
1506  */
1507 struct mbuf *
1508 m_prepend(struct mbuf *m, int len, int how)
1509 {
1510 	struct mbuf *mn;
1511 
1512 	if (m->m_flags & M_PKTHDR)
1513 	    mn = m_gethdr(how, m->m_type);
1514 	else
1515 	    mn = m_get(how, m->m_type);
1516 	if (mn == NULL) {
1517 		m_freem(m);
1518 		return (NULL);
1519 	}
1520 	if (m->m_flags & M_PKTHDR)
1521 		M_MOVE_PKTHDR(mn, m);
1522 	mn->m_next = m;
1523 	m = mn;
1524 	if (len < MHLEN)
1525 		MH_ALIGN(m, len);
1526 	m->m_len = len;
1527 	return (m);
1528 }
1529 
1530 /*
1531  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1532  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
1533  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
1534  * Note that the copy is read-only, because clusters are not copied,
1535  * only their reference counts are incremented.
1536  */
1537 struct mbuf *
1538 m_copym(const struct mbuf *m, int off0, int len, int wait)
1539 {
1540 	struct mbuf *n, **np;
1541 	int off = off0;
1542 	struct mbuf *top;
1543 	int copyhdr = 0;
1544 
1545 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
1546 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
1547 	if (off == 0 && (m->m_flags & M_PKTHDR))
1548 		copyhdr = 1;
1549 	while (off > 0) {
1550 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1551 		if (off < m->m_len)
1552 			break;
1553 		off -= m->m_len;
1554 		m = m->m_next;
1555 	}
1556 	np = &top;
1557 	top = NULL;
1558 	while (len > 0) {
1559 		if (m == NULL) {
1560 			KASSERT(len == M_COPYALL,
1561 			    ("m_copym, length > size of mbuf chain"));
1562 			break;
1563 		}
1564 		/*
1565 		 * Because we are sharing any cluster attachment below,
1566 		 * be sure to get an mbuf that does not have a cluster
1567 		 * associated with it.
1568 		 */
1569 		if (copyhdr)
1570 			n = m_gethdr(wait, m->m_type);
1571 		else
1572 			n = m_get(wait, m->m_type);
1573 		*np = n;
1574 		if (n == NULL)
1575 			goto nospace;
1576 		if (copyhdr) {
1577 			if (!m_dup_pkthdr(n, m, wait))
1578 				goto nospace;
1579 			if (len == M_COPYALL)
1580 				n->m_pkthdr.len -= off0;
1581 			else
1582 				n->m_pkthdr.len = len;
1583 			copyhdr = 0;
1584 		}
1585 		n->m_len = min(len, m->m_len - off);
1586 		if (m->m_flags & M_EXT) {
1587 			KKASSERT((n->m_flags & M_EXT) == 0);
1588 			n->m_data = m->m_data + off;
1589 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1590 			n->m_ext = m->m_ext;
1591 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1592 		} else {
1593 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1594 			    (unsigned)n->m_len);
1595 		}
1596 		if (len != M_COPYALL)
1597 			len -= n->m_len;
1598 		off = 0;
1599 		m = m->m_next;
1600 		np = &n->m_next;
1601 	}
1602 	if (top == NULL)
1603 		++mbstat[mycpu->gd_cpuid].m_mcfail;
1604 	return (top);
1605 nospace:
1606 	m_freem(top);
1607 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1608 	return (NULL);
1609 }
1610 
1611 /*
1612  * Copy an entire packet, including header (which must be present).
1613  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1614  * Note that the copy is read-only, because clusters are not copied,
1615  * only their reference counts are incremented.
1616  * Preserve alignment of the first mbuf so if the creator has left
1617  * some room at the beginning (e.g. for inserting protocol headers)
1618  * the copies also have the room available.
1619  */
1620 struct mbuf *
1621 m_copypacket(struct mbuf *m, int how)
1622 {
1623 	struct mbuf *top, *n, *o;
1624 
1625 	n = m_gethdr(how, m->m_type);
1626 	top = n;
1627 	if (!n)
1628 		goto nospace;
1629 
1630 	if (!m_dup_pkthdr(n, m, how))
1631 		goto nospace;
1632 	n->m_len = m->m_len;
1633 	if (m->m_flags & M_EXT) {
1634 		KKASSERT((n->m_flags & M_EXT) == 0);
1635 		n->m_data = m->m_data;
1636 		m->m_ext.ext_ref(m->m_ext.ext_arg);
1637 		n->m_ext = m->m_ext;
1638 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1639 	} else {
1640 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
1641 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1642 	}
1643 
1644 	m = m->m_next;
1645 	while (m) {
1646 		o = m_get(how, m->m_type);
1647 		if (!o)
1648 			goto nospace;
1649 
1650 		n->m_next = o;
1651 		n = n->m_next;
1652 
1653 		n->m_len = m->m_len;
1654 		if (m->m_flags & M_EXT) {
1655 			KKASSERT((n->m_flags & M_EXT) == 0);
1656 			n->m_data = m->m_data;
1657 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1658 			n->m_ext = m->m_ext;
1659 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1660 		} else {
1661 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1662 		}
1663 
1664 		m = m->m_next;
1665 	}
1666 	return top;
1667 nospace:
1668 	m_freem(top);
1669 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1670 	return (NULL);
1671 }
1672 
1673 /*
1674  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1675  * continuing for "len" bytes, into the indicated buffer.
1676  */
1677 void
1678 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1679 {
1680 	unsigned count;
1681 
1682 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1683 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1684 	while (off > 0) {
1685 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1686 		if (off < m->m_len)
1687 			break;
1688 		off -= m->m_len;
1689 		m = m->m_next;
1690 	}
1691 	while (len > 0) {
1692 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1693 		count = min(m->m_len - off, len);
1694 		bcopy(mtod(m, caddr_t) + off, cp, count);
1695 		len -= count;
1696 		cp += count;
1697 		off = 0;
1698 		m = m->m_next;
1699 	}
1700 }
1701 
1702 /*
1703  * Copy a packet header mbuf chain into a completely new chain, including
1704  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1705  * you need a writable copy of an mbuf chain.
1706  */
1707 struct mbuf *
1708 m_dup(struct mbuf *m, int how)
1709 {
1710 	struct mbuf **p, *top = NULL;
1711 	int remain, moff, nsize;
1712 
1713 	/* Sanity check */
1714 	if (m == NULL)
1715 		return (NULL);
1716 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1717 
1718 	/* While there's more data, get a new mbuf, tack it on, and fill it */
1719 	remain = m->m_pkthdr.len;
1720 	moff = 0;
1721 	p = &top;
1722 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
1723 		struct mbuf *n;
1724 
1725 		/* Get the next new mbuf */
1726 		n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1727 			   &nsize);
1728 		if (n == NULL)
1729 			goto nospace;
1730 		if (top == NULL)
1731 			if (!m_dup_pkthdr(n, m, how))
1732 				goto nospace0;
1733 
1734 		/* Link it into the new chain */
1735 		*p = n;
1736 		p = &n->m_next;
1737 
1738 		/* Copy data from original mbuf(s) into new mbuf */
1739 		n->m_len = 0;
1740 		while (n->m_len < nsize && m != NULL) {
1741 			int chunk = min(nsize - n->m_len, m->m_len - moff);
1742 
1743 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1744 			moff += chunk;
1745 			n->m_len += chunk;
1746 			remain -= chunk;
1747 			if (moff == m->m_len) {
1748 				m = m->m_next;
1749 				moff = 0;
1750 			}
1751 		}
1752 
1753 		/* Check correct total mbuf length */
1754 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1755 			("%s: bogus m_pkthdr.len", __func__));
1756 	}
1757 	return (top);
1758 
1759 nospace:
1760 	m_freem(top);
1761 nospace0:
1762 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1763 	return (NULL);
1764 }
1765 
1766 /*
1767  * Copy the non-packet mbuf data chain into a new set of mbufs, including
1768  * copying any mbuf clusters.  This is typically used to realign a data
1769  * chain by nfs_realign().
1770  *
1771  * The original chain is left intact.  how should be M_WAITOK or M_NOWAIT
1772  * and NULL can be returned if M_NOWAIT is passed.
1773  *
1774  * Be careful to use cluster mbufs, a large mbuf chain converted to non
1775  * cluster mbufs can exhaust our supply of mbufs.
1776  */
1777 struct mbuf *
1778 m_dup_data(struct mbuf *m, int how)
1779 {
1780 	struct mbuf **p, *n, *top = NULL;
1781 	int mlen, moff, chunk, gsize, nsize;
1782 
1783 	/*
1784 	 * Degenerate case
1785 	 */
1786 	if (m == NULL)
1787 		return (NULL);
1788 
1789 	/*
1790 	 * Optimize the mbuf allocation but do not get too carried away.
1791 	 */
1792 	if (m->m_next || m->m_len > MLEN)
1793 		if (m->m_flags & M_EXT && m->m_ext.ext_size == MCLBYTES)
1794 			gsize = MCLBYTES;
1795 		else
1796 			gsize = MJUMPAGESIZE;
1797 	else
1798 		gsize = MLEN;
1799 
1800 	/* Chain control */
1801 	p = &top;
1802 	n = NULL;
1803 	nsize = 0;
1804 
1805 	/*
1806 	 * Scan the mbuf chain until nothing is left, the new mbuf chain
1807 	 * will be allocated on the fly as needed.
1808 	 */
1809 	while (m) {
1810 		mlen = m->m_len;
1811 		moff = 0;
1812 
1813 		while (mlen) {
1814 			KKASSERT(m->m_type == MT_DATA);
1815 			if (n == NULL) {
1816 				n = m_getl(gsize, how, MT_DATA, 0, &nsize);
1817 				n->m_len = 0;
1818 				if (n == NULL)
1819 					goto nospace;
1820 				*p = n;
1821 				p = &n->m_next;
1822 			}
1823 			chunk = imin(mlen, nsize);
1824 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1825 			mlen -= chunk;
1826 			moff += chunk;
1827 			n->m_len += chunk;
1828 			nsize -= chunk;
1829 			if (nsize == 0)
1830 				n = NULL;
1831 		}
1832 		m = m->m_next;
1833 	}
1834 	*p = NULL;
1835 	return(top);
1836 nospace:
1837 	*p = NULL;
1838 	m_freem(top);
1839 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1840 	return (NULL);
1841 }
1842 
1843 /*
1844  * Concatenate mbuf chain n to m.
1845  * Both chains must be of the same type (e.g. MT_DATA).
1846  * Any m_pkthdr is not updated.
1847  */
1848 void
1849 m_cat(struct mbuf *m, struct mbuf *n)
1850 {
1851 	m = m_last(m);
1852 	while (n) {
1853 		if (m->m_flags & M_EXT ||
1854 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1855 			/* just join the two chains */
1856 			m->m_next = n;
1857 			return;
1858 		}
1859 		/* splat the data from one into the other */
1860 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1861 		    (u_int)n->m_len);
1862 		m->m_len += n->m_len;
1863 		n = m_free(n);
1864 	}
1865 }
1866 
1867 void
1868 m_adj(struct mbuf *mp, int req_len)
1869 {
1870 	int len = req_len;
1871 	struct mbuf *m;
1872 	int count;
1873 
1874 	if ((m = mp) == NULL)
1875 		return;
1876 	if (len >= 0) {
1877 		/*
1878 		 * Trim from head.
1879 		 */
1880 		while (m != NULL && len > 0) {
1881 			if (m->m_len <= len) {
1882 				len -= m->m_len;
1883 				m->m_len = 0;
1884 				m = m->m_next;
1885 			} else {
1886 				m->m_len -= len;
1887 				m->m_data += len;
1888 				len = 0;
1889 			}
1890 		}
1891 		m = mp;
1892 		if (mp->m_flags & M_PKTHDR)
1893 			m->m_pkthdr.len -= (req_len - len);
1894 	} else {
1895 		/*
1896 		 * Trim from tail.  Scan the mbuf chain,
1897 		 * calculating its length and finding the last mbuf.
1898 		 * If the adjustment only affects this mbuf, then just
1899 		 * adjust and return.  Otherwise, rescan and truncate
1900 		 * after the remaining size.
1901 		 */
1902 		len = -len;
1903 		count = 0;
1904 		for (;;) {
1905 			count += m->m_len;
1906 			if (m->m_next == NULL)
1907 				break;
1908 			m = m->m_next;
1909 		}
1910 		if (m->m_len >= len) {
1911 			m->m_len -= len;
1912 			if (mp->m_flags & M_PKTHDR)
1913 				mp->m_pkthdr.len -= len;
1914 			return;
1915 		}
1916 		count -= len;
1917 		if (count < 0)
1918 			count = 0;
1919 		/*
1920 		 * Correct length for chain is "count".
1921 		 * Find the mbuf with last data, adjust its length,
1922 		 * and toss data from remaining mbufs on chain.
1923 		 */
1924 		m = mp;
1925 		if (m->m_flags & M_PKTHDR)
1926 			m->m_pkthdr.len = count;
1927 		for (; m; m = m->m_next) {
1928 			if (m->m_len >= count) {
1929 				m->m_len = count;
1930 				break;
1931 			}
1932 			count -= m->m_len;
1933 		}
1934 		while (m->m_next)
1935 			(m = m->m_next) ->m_len = 0;
1936 	}
1937 }
1938 
1939 /*
1940  * Set the m_data pointer of a newly-allocated mbuf
1941  * to place an object of the specified size at the
1942  * end of the mbuf, longword aligned.
1943  */
1944 void
1945 m_align(struct mbuf *m, int len)
1946 {
1947 	int adjust;
1948 
1949 	if (m->m_flags & M_EXT)
1950 		adjust = m->m_ext.ext_size - len;
1951 	else if (m->m_flags & M_PKTHDR)
1952 		adjust = MHLEN - len;
1953 	else
1954 		adjust = MLEN - len;
1955 	m->m_data += adjust &~ (sizeof(long)-1);
1956 }
1957 
1958 /*
1959  * Create a writable copy of the mbuf chain.  While doing this
1960  * we compact the chain with a goal of producing a chain with
1961  * at most two mbufs.  The second mbuf in this chain is likely
1962  * to be a cluster.  The primary purpose of this work is to create
1963  * a writable packet for encryption, compression, etc.  The
1964  * secondary goal is to linearize the data so the data can be
1965  * passed to crypto hardware in the most efficient manner possible.
1966  */
1967 struct mbuf *
1968 m_unshare(struct mbuf *m0, int how)
1969 {
1970 	struct mbuf *m, *mprev;
1971 	struct mbuf *n, *mfirst, *mlast;
1972 	int len, off;
1973 
1974 	mprev = NULL;
1975 	for (m = m0; m != NULL; m = mprev->m_next) {
1976 		/*
1977 		 * Regular mbufs are ignored unless there's a cluster
1978 		 * in front of it that we can use to coalesce.  We do
1979 		 * the latter mainly so later clusters can be coalesced
1980 		 * also w/o having to handle them specially (i.e. convert
1981 		 * mbuf+cluster -> cluster).  This optimization is heavily
1982 		 * influenced by the assumption that we're running over
1983 		 * Ethernet where MCLBYTES is large enough that the max
1984 		 * packet size will permit lots of coalescing into a
1985 		 * single cluster.  This in turn permits efficient
1986 		 * crypto operations, especially when using hardware.
1987 		 */
1988 		if ((m->m_flags & M_EXT) == 0) {
1989 			if (mprev && (mprev->m_flags & M_EXT) &&
1990 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
1991 				/* XXX: this ignores mbuf types */
1992 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1993 				       mtod(m, caddr_t), m->m_len);
1994 				mprev->m_len += m->m_len;
1995 				mprev->m_next = m->m_next;	/* unlink from chain */
1996 				m_free(m);			/* reclaim mbuf */
1997 			} else {
1998 				mprev = m;
1999 			}
2000 			continue;
2001 		}
2002 		/*
2003 		 * Writable mbufs are left alone (for now).
2004 		 */
2005 		if (M_WRITABLE(m)) {
2006 			mprev = m;
2007 			continue;
2008 		}
2009 
2010 		/*
2011 		 * Not writable, replace with a copy or coalesce with
2012 		 * the previous mbuf if possible (since we have to copy
2013 		 * it anyway, we try to reduce the number of mbufs and
2014 		 * clusters so that future work is easier).
2015 		 */
2016 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
2017 		/* NB: we only coalesce into a cluster or larger */
2018 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
2019 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
2020 			/* XXX: this ignores mbuf types */
2021 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2022 			       mtod(m, caddr_t), m->m_len);
2023 			mprev->m_len += m->m_len;
2024 			mprev->m_next = m->m_next;	/* unlink from chain */
2025 			m_free(m);			/* reclaim mbuf */
2026 			continue;
2027 		}
2028 
2029 		/*
2030 		 * Allocate new space to hold the copy...
2031 		 */
2032 		/* XXX why can M_PKTHDR be set past the first mbuf? */
2033 		if (mprev == NULL && (m->m_flags & M_PKTHDR)) {
2034 			/*
2035 			 * NB: if a packet header is present we must
2036 			 * allocate the mbuf separately from any cluster
2037 			 * because M_MOVE_PKTHDR will smash the data
2038 			 * pointer and drop the M_EXT marker.
2039 			 */
2040 			MGETHDR(n, how, m->m_type);
2041 			if (n == NULL) {
2042 				m_freem(m0);
2043 				return (NULL);
2044 			}
2045 			M_MOVE_PKTHDR(n, m);
2046 			MCLGET(n, how);
2047 			if ((n->m_flags & M_EXT) == 0) {
2048 				m_free(n);
2049 				m_freem(m0);
2050 				return (NULL);
2051 			}
2052 		} else {
2053 			n = m_getcl(how, m->m_type, m->m_flags);
2054 			if (n == NULL) {
2055 				m_freem(m0);
2056 				return (NULL);
2057 			}
2058 		}
2059 		/*
2060 		 * ... and copy the data.  We deal with jumbo mbufs
2061 		 * (i.e. m_len > MCLBYTES) by splitting them into
2062 		 * clusters.  We could just malloc a buffer and make
2063 		 * it external but too many device drivers don't know
2064 		 * how to break up the non-contiguous memory when
2065 		 * doing DMA.
2066 		 */
2067 		len = m->m_len;
2068 		off = 0;
2069 		mfirst = n;
2070 		mlast = NULL;
2071 		for (;;) {
2072 			int cc = min(len, MCLBYTES);
2073 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
2074 			n->m_len = cc;
2075 			if (mlast != NULL)
2076 				mlast->m_next = n;
2077 			mlast = n;
2078 
2079 			len -= cc;
2080 			if (len <= 0)
2081 				break;
2082 			off += cc;
2083 
2084 			n = m_getcl(how, m->m_type, m->m_flags);
2085 			if (n == NULL) {
2086 				m_freem(mfirst);
2087 				m_freem(m0);
2088 				return (NULL);
2089 			}
2090 		}
2091 		n->m_next = m->m_next;
2092 		if (mprev == NULL)
2093 			m0 = mfirst;		/* new head of chain */
2094 		else
2095 			mprev->m_next = mfirst;	/* replace old mbuf */
2096 		m_free(m);			/* release old mbuf */
2097 		mprev = mfirst;
2098 	}
2099 	return (m0);
2100 }
2101 
2102 /*
2103  * Rearrange an mbuf chain so that len bytes are contiguous
2104  * and in the data area of an mbuf (so that mtod will work for a structure
2105  * of size len).  Returns the resulting mbuf chain on success, frees it and
2106  * returns null on failure.  If there is room, it will add up to
2107  * max_protohdr-len extra bytes to the contiguous region in an attempt to
2108  * avoid being called next time.
2109  */
2110 struct mbuf *
2111 m_pullup(struct mbuf *n, int len)
2112 {
2113 	struct mbuf *m;
2114 	int count;
2115 	int space;
2116 
2117 	/*
2118 	 * If first mbuf has no cluster, and has room for len bytes
2119 	 * without shifting current data, pullup into it,
2120 	 * otherwise allocate a new mbuf to prepend to the chain.
2121 	 */
2122 	if (!(n->m_flags & M_EXT) &&
2123 	    n->m_data + len < &n->m_dat[MLEN] &&
2124 	    n->m_next) {
2125 		if (n->m_len >= len)
2126 			return (n);
2127 		m = n;
2128 		n = n->m_next;
2129 		len -= m->m_len;
2130 	} else {
2131 		if (len > MHLEN)
2132 			goto bad;
2133 		if (n->m_flags & M_PKTHDR)
2134 			m = m_gethdr(M_NOWAIT, n->m_type);
2135 		else
2136 			m = m_get(M_NOWAIT, n->m_type);
2137 		if (m == NULL)
2138 			goto bad;
2139 		m->m_len = 0;
2140 		if (n->m_flags & M_PKTHDR)
2141 			M_MOVE_PKTHDR(m, n);
2142 	}
2143 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
2144 	do {
2145 		count = min(min(max(len, max_protohdr), space), n->m_len);
2146 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
2147 		  (unsigned)count);
2148 		len -= count;
2149 		m->m_len += count;
2150 		n->m_len -= count;
2151 		space -= count;
2152 		if (n->m_len)
2153 			n->m_data += count;
2154 		else
2155 			n = m_free(n);
2156 	} while (len > 0 && n);
2157 	if (len > 0) {
2158 		m_free(m);
2159 		goto bad;
2160 	}
2161 	m->m_next = n;
2162 	return (m);
2163 bad:
2164 	m_freem(n);
2165 	++mbstat[mycpu->gd_cpuid].m_mcfail;
2166 	return (NULL);
2167 }
2168 
2169 /*
2170  * Partition an mbuf chain in two pieces, returning the tail --
2171  * all but the first len0 bytes.  In case of failure, it returns NULL and
2172  * attempts to restore the chain to its original state.
2173  *
2174  * Note that the resulting mbufs might be read-only, because the new
2175  * mbuf can end up sharing an mbuf cluster with the original mbuf if
2176  * the "breaking point" happens to lie within a cluster mbuf. Use the
2177  * M_WRITABLE() macro to check for this case.
2178  */
2179 struct mbuf *
2180 m_split(struct mbuf *m0, int len0, int wait)
2181 {
2182 	struct mbuf *m, *n;
2183 	unsigned len = len0, remain;
2184 
2185 	for (m = m0; m && len > m->m_len; m = m->m_next)
2186 		len -= m->m_len;
2187 	if (m == NULL)
2188 		return (NULL);
2189 	remain = m->m_len - len;
2190 	if (m0->m_flags & M_PKTHDR) {
2191 		n = m_gethdr(wait, m0->m_type);
2192 		if (n == NULL)
2193 			return (NULL);
2194 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
2195 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
2196 		m0->m_pkthdr.len = len0;
2197 		if (m->m_flags & M_EXT)
2198 			goto extpacket;
2199 		if (remain > MHLEN) {
2200 			/* m can't be the lead packet */
2201 			MH_ALIGN(n, 0);
2202 			n->m_next = m_split(m, len, wait);
2203 			if (n->m_next == NULL) {
2204 				m_free(n);
2205 				return (NULL);
2206 			} else {
2207 				n->m_len = 0;
2208 				return (n);
2209 			}
2210 		} else
2211 			MH_ALIGN(n, remain);
2212 	} else if (remain == 0) {
2213 		n = m->m_next;
2214 		m->m_next = NULL;
2215 		return (n);
2216 	} else {
2217 		n = m_get(wait, m->m_type);
2218 		if (n == NULL)
2219 			return (NULL);
2220 		M_ALIGN(n, remain);
2221 	}
2222 extpacket:
2223 	if (m->m_flags & M_EXT) {
2224 		KKASSERT((n->m_flags & M_EXT) == 0);
2225 		n->m_data = m->m_data + len;
2226 		m->m_ext.ext_ref(m->m_ext.ext_arg);
2227 		n->m_ext = m->m_ext;
2228 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
2229 	} else {
2230 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
2231 	}
2232 	n->m_len = remain;
2233 	m->m_len = len;
2234 	n->m_next = m->m_next;
2235 	m->m_next = NULL;
2236 	return (n);
2237 }
2238 
2239 /*
2240  * Routine to copy from device local memory into mbufs.
2241  * Note: "offset" is ill-defined and always called as 0, so ignore it.
2242  */
2243 struct mbuf *
2244 m_devget(char *buf, int len, int offset, struct ifnet *ifp)
2245 {
2246 	struct mbuf *m, *mfirst = NULL, **mtail;
2247 	int nsize, flags;
2248 
2249 	mtail = &mfirst;
2250 	flags = M_PKTHDR;
2251 
2252 	while (len > 0) {
2253 		m = m_getl(len, M_NOWAIT, MT_DATA, flags, &nsize);
2254 		if (m == NULL) {
2255 			m_freem(mfirst);
2256 			return (NULL);
2257 		}
2258 		m->m_len = min(len, nsize);
2259 
2260 		if (flags & M_PKTHDR) {
2261 			if (len + max_linkhdr <= nsize)
2262 				m->m_data += max_linkhdr;
2263 			m->m_pkthdr.rcvif = ifp;
2264 			m->m_pkthdr.len = len;
2265 			flags = 0;
2266 		}
2267 
2268 		bcopy(buf, m->m_data, (unsigned)m->m_len);
2269 		buf += m->m_len;
2270 		len -= m->m_len;
2271 		*mtail = m;
2272 		mtail = &m->m_next;
2273 	}
2274 
2275 	return (mfirst);
2276 }
2277 
2278 /*
2279  * Routine to pad mbuf to the specified length 'padto'.
2280  */
2281 int
2282 m_devpad(struct mbuf *m, int padto)
2283 {
2284 	struct mbuf *last = NULL;
2285 	int padlen;
2286 
2287 	if (padto <= m->m_pkthdr.len)
2288 		return 0;
2289 
2290 	padlen = padto - m->m_pkthdr.len;
2291 
2292 	/* if there's only the packet-header and we can pad there, use it. */
2293 	if (m->m_pkthdr.len == m->m_len && M_TRAILINGSPACE(m) >= padlen) {
2294 		last = m;
2295 	} else {
2296 		/*
2297 		 * Walk packet chain to find last mbuf. We will either
2298 		 * pad there, or append a new mbuf and pad it
2299 		 */
2300 		for (last = m; last->m_next != NULL; last = last->m_next)
2301 			; /* EMPTY */
2302 
2303 		/* `last' now points to last in chain. */
2304 		if (M_TRAILINGSPACE(last) < padlen) {
2305 			struct mbuf *n;
2306 
2307 			/* Allocate new empty mbuf, pad it.  Compact later. */
2308 			MGET(n, M_NOWAIT, MT_DATA);
2309 			if (n == NULL)
2310 				return ENOBUFS;
2311 			n->m_len = 0;
2312 			last->m_next = n;
2313 			last = n;
2314 		}
2315 	}
2316 	KKASSERT(M_TRAILINGSPACE(last) >= padlen);
2317 	KKASSERT(M_WRITABLE(last));
2318 
2319 	/* Now zero the pad area */
2320 	bzero(mtod(last, char *) + last->m_len, padlen);
2321 	last->m_len += padlen;
2322 	m->m_pkthdr.len += padlen;
2323 	return 0;
2324 }
2325 
2326 /*
2327  * Copy data from a buffer back into the indicated mbuf chain,
2328  * starting "off" bytes from the beginning, extending the mbuf
2329  * chain if necessary.
2330  */
2331 void
2332 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
2333 {
2334 	int mlen;
2335 	struct mbuf *m = m0, *n;
2336 	int totlen = 0;
2337 
2338 	if (m0 == NULL)
2339 		return;
2340 	while (off > (mlen = m->m_len)) {
2341 		off -= mlen;
2342 		totlen += mlen;
2343 		if (m->m_next == NULL) {
2344 			n = m_getclr(M_NOWAIT, m->m_type);
2345 			if (n == NULL)
2346 				goto out;
2347 			n->m_len = min(MLEN, len + off);
2348 			m->m_next = n;
2349 		}
2350 		m = m->m_next;
2351 	}
2352 	while (len > 0) {
2353 		mlen = min (m->m_len - off, len);
2354 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
2355 		cp += mlen;
2356 		len -= mlen;
2357 		mlen += off;
2358 		off = 0;
2359 		totlen += mlen;
2360 		if (len == 0)
2361 			break;
2362 		if (m->m_next == NULL) {
2363 			n = m_get(M_NOWAIT, m->m_type);
2364 			if (n == NULL)
2365 				break;
2366 			n->m_len = min(MLEN, len);
2367 			m->m_next = n;
2368 		}
2369 		m = m->m_next;
2370 	}
2371 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
2372 		m->m_pkthdr.len = totlen;
2373 }
2374 
2375 /*
2376  * Append the specified data to the indicated mbuf chain,
2377  * Extend the mbuf chain if the new data does not fit in
2378  * existing space.
2379  *
2380  * Return 1 if able to complete the job; otherwise 0.
2381  */
2382 int
2383 m_append(struct mbuf *m0, int len, c_caddr_t cp)
2384 {
2385 	struct mbuf *m, *n;
2386 	int remainder, space;
2387 
2388 	for (m = m0; m->m_next != NULL; m = m->m_next)
2389 		;
2390 	remainder = len;
2391 	space = M_TRAILINGSPACE(m);
2392 	if (space > 0) {
2393 		/*
2394 		 * Copy into available space.
2395 		 */
2396 		if (space > remainder)
2397 			space = remainder;
2398 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2399 		m->m_len += space;
2400 		cp += space, remainder -= space;
2401 	}
2402 	while (remainder > 0) {
2403 		/*
2404 		 * Allocate a new mbuf; could check space
2405 		 * and allocate a cluster instead.
2406 		 */
2407 		n = m_get(M_NOWAIT, m->m_type);
2408 		if (n == NULL)
2409 			break;
2410 		n->m_len = min(MLEN, remainder);
2411 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2412 		cp += n->m_len, remainder -= n->m_len;
2413 		m->m_next = n;
2414 		m = n;
2415 	}
2416 	if (m0->m_flags & M_PKTHDR)
2417 		m0->m_pkthdr.len += len - remainder;
2418 	return (remainder == 0);
2419 }
2420 
2421 /*
2422  * Apply function f to the data in an mbuf chain starting "off" bytes from
2423  * the beginning, continuing for "len" bytes.
2424  */
2425 int
2426 m_apply(struct mbuf *m, int off, int len,
2427     int (*f)(void *, void *, u_int), void *arg)
2428 {
2429 	u_int count;
2430 	int rval;
2431 
2432 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
2433 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
2434 	while (off > 0) {
2435 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
2436 		if (off < m->m_len)
2437 			break;
2438 		off -= m->m_len;
2439 		m = m->m_next;
2440 	}
2441 	while (len > 0) {
2442 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
2443 		count = min(m->m_len - off, len);
2444 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
2445 		if (rval)
2446 			return (rval);
2447 		len -= count;
2448 		off = 0;
2449 		m = m->m_next;
2450 	}
2451 	return (0);
2452 }
2453 
2454 /*
2455  * Return a pointer to mbuf/offset of location in mbuf chain.
2456  */
2457 struct mbuf *
2458 m_getptr(struct mbuf *m, int loc, int *off)
2459 {
2460 
2461 	while (loc >= 0) {
2462 		/* Normal end of search. */
2463 		if (m->m_len > loc) {
2464 			*off = loc;
2465 			return (m);
2466 		} else {
2467 			loc -= m->m_len;
2468 			if (m->m_next == NULL) {
2469 				if (loc == 0) {
2470 					/* Point at the end of valid data. */
2471 					*off = m->m_len;
2472 					return (m);
2473 				}
2474 				return (NULL);
2475 			}
2476 			m = m->m_next;
2477 		}
2478 	}
2479 	return (NULL);
2480 }
2481 
2482 void
2483 m_print(const struct mbuf *m)
2484 {
2485 	int len;
2486 	const struct mbuf *m2;
2487 	char *hexstr;
2488 
2489 	len = m->m_pkthdr.len;
2490 	m2 = m;
2491 	hexstr = kmalloc(HEX_NCPYLEN(len), M_TEMP, M_ZERO | M_WAITOK);
2492 	while (len) {
2493 		kprintf("%p %s\n", m2, hexncpy(m2->m_data, m2->m_len, hexstr,
2494 			HEX_NCPYLEN(m2->m_len), "-"));
2495 		len -= m2->m_len;
2496 		m2 = m2->m_next;
2497 	}
2498 	kfree(hexstr, M_TEMP);
2499 	return;
2500 }
2501 
2502 /*
2503  * "Move" mbuf pkthdr from "from" to "to".
2504  * "from" must have M_PKTHDR set, and "to" must be empty.
2505  */
2506 void
2507 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
2508 {
2509 	KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
2510 
2511 	to->m_flags |= from->m_flags & M_COPYFLAGS;
2512 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
2513 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
2514 }
2515 
2516 /*
2517  * Duplicate "from"'s mbuf pkthdr in "to".
2518  * "from" must have M_PKTHDR set, and "to" must be empty.
2519  * In particular, this does a deep copy of the packet tags.
2520  */
2521 int
2522 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
2523 {
2524 	KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
2525 
2526 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
2527 		      (to->m_flags & ~M_COPYFLAGS);
2528 	to->m_pkthdr = from->m_pkthdr;
2529 	SLIST_INIT(&to->m_pkthdr.tags);
2530 	return (m_tag_copy_chain(to, from, how));
2531 }
2532 
2533 /*
2534  * Defragment a mbuf chain, returning the shortest possible
2535  * chain of mbufs and clusters.  If allocation fails and
2536  * this cannot be completed, NULL will be returned, but
2537  * the passed in chain will be unchanged.  Upon success,
2538  * the original chain will be freed, and the new chain
2539  * will be returned.
2540  *
2541  * If a non-packet header is passed in, the original
2542  * mbuf (chain?) will be returned unharmed.
2543  *
2544  * m_defrag_nofree doesn't free the passed in mbuf.
2545  */
2546 struct mbuf *
2547 m_defrag(struct mbuf *m0, int how)
2548 {
2549 	struct mbuf *m_new;
2550 
2551 	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
2552 		return (NULL);
2553 	if (m_new != m0)
2554 		m_freem(m0);
2555 	return (m_new);
2556 }
2557 
2558 struct mbuf *
2559 m_defrag_nofree(struct mbuf *m0, int how)
2560 {
2561 	struct mbuf	*m_new = NULL, *m_final = NULL;
2562 	int		progress = 0, length, nsize;
2563 
2564 	if (!(m0->m_flags & M_PKTHDR))
2565 		return (m0);
2566 
2567 #ifdef MBUF_STRESS_TEST
2568 	if (m_defragrandomfailures) {
2569 		int temp = karc4random() & 0xff;
2570 		if (temp == 0xba)
2571 			goto nospace;
2572 	}
2573 #endif
2574 
2575 	m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
2576 	if (m_final == NULL)
2577 		goto nospace;
2578 	m_final->m_len = 0;	/* in case m0->m_pkthdr.len is zero */
2579 
2580 	if (m_dup_pkthdr(m_final, m0, how) == 0)
2581 		goto nospace;
2582 
2583 	m_new = m_final;
2584 
2585 	while (progress < m0->m_pkthdr.len) {
2586 		length = m0->m_pkthdr.len - progress;
2587 		if (length > MCLBYTES)
2588 			length = MCLBYTES;
2589 
2590 		if (m_new == NULL) {
2591 			m_new = m_getl(length, how, MT_DATA, 0, &nsize);
2592 			if (m_new == NULL)
2593 				goto nospace;
2594 		}
2595 
2596 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
2597 		progress += length;
2598 		m_new->m_len = length;
2599 		if (m_new != m_final)
2600 			m_cat(m_final, m_new);
2601 		m_new = NULL;
2602 	}
2603 	if (m0->m_next == NULL)
2604 		m_defraguseless++;
2605 	m_defragpackets++;
2606 	m_defragbytes += m_final->m_pkthdr.len;
2607 	return (m_final);
2608 nospace:
2609 	m_defragfailure++;
2610 	if (m_new)
2611 		m_free(m_new);
2612 	m_freem(m_final);
2613 	return (NULL);
2614 }
2615 
2616 /*
2617  * Move data from uio into mbufs.
2618  */
2619 struct mbuf *
2620 m_uiomove(struct uio *uio)
2621 {
2622 	struct mbuf *m;			/* current working mbuf */
2623 	struct mbuf *head = NULL;	/* result mbuf chain */
2624 	struct mbuf **mp = &head;
2625 	int flags = M_PKTHDR;
2626 	int nsize;
2627 	int error;
2628 	int resid;
2629 
2630 	do {
2631 		if (uio->uio_resid > INT_MAX)
2632 			resid = INT_MAX;
2633 		else
2634 			resid = (int)uio->uio_resid;
2635 		m = m_getl(resid, M_WAITOK, MT_DATA, flags, &nsize);
2636 		if (flags) {
2637 			m->m_pkthdr.len = 0;
2638 			/* Leave room for protocol headers. */
2639 			if (resid < MHLEN)
2640 				MH_ALIGN(m, resid);
2641 			flags = 0;
2642 		}
2643 		m->m_len = imin(nsize, resid);
2644 		error = uiomove(mtod(m, caddr_t), m->m_len, uio);
2645 		if (error) {
2646 			m_free(m);
2647 			goto failed;
2648 		}
2649 		*mp = m;
2650 		mp = &m->m_next;
2651 		head->m_pkthdr.len += m->m_len;
2652 	} while (uio->uio_resid > 0);
2653 
2654 	return (head);
2655 
2656 failed:
2657 	m_freem(head);
2658 	return (NULL);
2659 }
2660 
2661 struct mbuf *
2662 m_last(struct mbuf *m)
2663 {
2664 	while (m->m_next)
2665 		m = m->m_next;
2666 	return (m);
2667 }
2668 
2669 /*
2670  * Return the number of bytes in an mbuf chain.
2671  * If lastm is not NULL, also return the last mbuf.
2672  */
2673 u_int
2674 m_lengthm(struct mbuf *m, struct mbuf **lastm)
2675 {
2676 	u_int len = 0;
2677 	struct mbuf *prev = m;
2678 
2679 	while (m) {
2680 		len += m->m_len;
2681 		prev = m;
2682 		m = m->m_next;
2683 	}
2684 	if (lastm != NULL)
2685 		*lastm = prev;
2686 	return (len);
2687 }
2688 
2689 /*
2690  * Like m_lengthm(), except also keep track of mbuf usage.
2691  */
2692 u_int
2693 m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
2694 {
2695 	u_int len = 0, mbcnt = 0;
2696 	struct mbuf *prev = m;
2697 
2698 	while (m) {
2699 		len += m->m_len;
2700 		mbcnt += MSIZE;
2701 		if (m->m_flags & M_EXT)
2702 			mbcnt += m->m_ext.ext_size;
2703 		prev = m;
2704 		m = m->m_next;
2705 	}
2706 	if (lastm != NULL)
2707 		*lastm = prev;
2708 	*pmbcnt = mbcnt;
2709 	return (len);
2710 }
2711