xref: /dragonfly/sys/kern/uipc_mbuf.c (revision 81c11cd3)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
5  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
6  *
7  * This code is derived from software contributed to The DragonFly Project
8  * by Jeffrey M. Hsu.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 /*
37  * Copyright (c) 1982, 1986, 1988, 1991, 1993
38  *	The Regents of the University of California.  All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. All advertising materials mentioning features or use of this software
49  *    must display the following acknowledgement:
50  *	This product includes software developed by the University of
51  *	California, Berkeley and its contributors.
52  * 4. Neither the name of the University nor the names of its contributors
53  *    may be used to endorse or promote products derived from this software
54  *    without specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66  * SUCH DAMAGE.
67  *
68  * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
69  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
70  * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.70 2008/11/20 14:21:01 sephe Exp $
71  */
72 
73 #include "opt_param.h"
74 #include "opt_mbuf_stress_test.h"
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/file.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/kernel.h>
81 #include <sys/sysctl.h>
82 #include <sys/domain.h>
83 #include <sys/objcache.h>
84 #include <sys/tree.h>
85 #include <sys/protosw.h>
86 #include <sys/uio.h>
87 #include <sys/thread.h>
88 #include <sys/globaldata.h>
89 
90 #include <sys/thread2.h>
91 #include <sys/spinlock2.h>
92 
93 #include <machine/atomic.h>
94 #include <machine/limits.h>
95 
96 #include <vm/vm.h>
97 #include <vm/vm_kern.h>
98 #include <vm/vm_extern.h>
99 
100 #ifdef INVARIANTS
101 #include <machine/cpu.h>
102 #endif
103 
104 /*
105  * mbuf cluster meta-data
106  */
107 struct mbcluster {
108 	int32_t	mcl_refs;
109 	void	*mcl_data;
110 };
111 
112 /*
113  * mbuf tracking for debugging purposes
114  */
115 #ifdef MBUF_DEBUG
116 
117 static MALLOC_DEFINE(M_MTRACK, "mtrack", "mtrack");
118 
119 struct mbtrack;
120 RB_HEAD(mbuf_rb_tree, mbtrack);
121 RB_PROTOTYPE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *);
122 
123 struct mbtrack {
124 	RB_ENTRY(mbtrack) rb_node;
125 	int trackid;
126 	struct mbuf *m;
127 };
128 
129 static int
130 mbtrack_cmp(struct mbtrack *mb1, struct mbtrack *mb2)
131 {
132 	if (mb1->m < mb2->m)
133 		return(-1);
134 	if (mb1->m > mb2->m)
135 		return(1);
136 	return(0);
137 }
138 
139 RB_GENERATE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *, m);
140 
141 struct mbuf_rb_tree	mbuf_track_root;
142 static struct spinlock	mbuf_track_spin = SPINLOCK_INITIALIZER(mbuf_track_spin);
143 
144 static void
145 mbuftrack(struct mbuf *m)
146 {
147 	struct mbtrack *mbt;
148 
149 	mbt = kmalloc(sizeof(*mbt), M_MTRACK, M_INTWAIT|M_ZERO);
150 	spin_lock(&mbuf_track_spin);
151 	mbt->m = m;
152 	if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt)) {
153 		spin_unlock(&mbuf_track_spin);
154 		panic("mbuftrack: mbuf %p already being tracked\n", m);
155 	}
156 	spin_unlock(&mbuf_track_spin);
157 }
158 
159 static void
160 mbufuntrack(struct mbuf *m)
161 {
162 	struct mbtrack *mbt;
163 
164 	spin_lock(&mbuf_track_spin);
165 	mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
166 	if (mbt == NULL) {
167 		spin_unlock(&mbuf_track_spin);
168 		panic("mbufuntrack: mbuf %p was not tracked\n", m);
169 	} else {
170 		mbuf_rb_tree_RB_REMOVE(&mbuf_track_root, mbt);
171 		spin_unlock(&mbuf_track_spin);
172 		kfree(mbt, M_MTRACK);
173 	}
174 }
175 
176 void
177 mbuftrackid(struct mbuf *m, int trackid)
178 {
179 	struct mbtrack *mbt;
180 	struct mbuf *n;
181 
182 	spin_lock(&mbuf_track_spin);
183 	while (m) {
184 		n = m->m_nextpkt;
185 		while (m) {
186 			mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
187 			if (mbt == NULL) {
188 				spin_unlock(&mbuf_track_spin);
189 				panic("mbuftrackid: mbuf %p not tracked", m);
190 			}
191 			mbt->trackid = trackid;
192 			m = m->m_next;
193 		}
194 		m = n;
195 	}
196 	spin_unlock(&mbuf_track_spin);
197 }
198 
199 static int
200 mbuftrack_callback(struct mbtrack *mbt, void *arg)
201 {
202 	struct sysctl_req *req = arg;
203 	char buf[64];
204 	int error;
205 
206 	ksnprintf(buf, sizeof(buf), "mbuf %p track %d\n", mbt->m, mbt->trackid);
207 
208 	spin_unlock(&mbuf_track_spin);
209 	error = SYSCTL_OUT(req, buf, strlen(buf));
210 	spin_lock(&mbuf_track_spin);
211 	if (error)
212 		return(-error);
213 	return(0);
214 }
215 
216 static int
217 mbuftrack_show(SYSCTL_HANDLER_ARGS)
218 {
219 	int error;
220 
221 	spin_lock(&mbuf_track_spin);
222 	error = mbuf_rb_tree_RB_SCAN(&mbuf_track_root, NULL,
223 				     mbuftrack_callback, req);
224 	spin_unlock(&mbuf_track_spin);
225 	return (-error);
226 }
227 SYSCTL_PROC(_kern_ipc, OID_AUTO, showmbufs, CTLFLAG_RD|CTLTYPE_STRING,
228 	    0, 0, mbuftrack_show, "A", "Show all in-use mbufs");
229 
230 #else
231 
232 #define mbuftrack(m)
233 #define mbufuntrack(m)
234 
235 #endif
236 
237 static void mbinit(void *);
238 SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL)
239 
240 static u_long	mbtypes[SMP_MAXCPU][MT_NTYPES];
241 
242 static struct mbstat mbstat[SMP_MAXCPU];
243 int	max_linkhdr;
244 int	max_protohdr;
245 int	max_hdr;
246 int	max_datalen;
247 int	m_defragpackets;
248 int	m_defragbytes;
249 int	m_defraguseless;
250 int	m_defragfailure;
251 #ifdef MBUF_STRESS_TEST
252 int	m_defragrandomfailures;
253 #endif
254 
255 struct objcache *mbuf_cache, *mbufphdr_cache;
256 struct objcache *mclmeta_cache;
257 struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;
258 
259 int	nmbclusters;
260 int	nmbufs;
261 
262 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
263 	&max_linkhdr, 0, "Max size of a link-level header");
264 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
265 	&max_protohdr, 0, "Max size of a protocol header");
266 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0,
267 	"Max size of link+protocol headers");
268 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
269 	&max_datalen, 0, "Max data payload size without headers");
270 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
271 	&mbuf_wait, 0, "Time in ticks to sleep after failed mbuf allocations");
272 static int do_mbstat(SYSCTL_HANDLER_ARGS);
273 
274 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT|CTLFLAG_RD,
275 	0, 0, do_mbstat, "S,mbstat", "mbuf usage statistics");
276 
277 static int do_mbtypes(SYSCTL_HANDLER_ARGS);
278 
279 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtypes, CTLTYPE_ULONG|CTLFLAG_RD,
280 	0, 0, do_mbtypes, "LU", "");
281 
282 static int
283 do_mbstat(SYSCTL_HANDLER_ARGS)
284 {
285 	struct mbstat mbstat_total;
286 	struct mbstat *mbstat_totalp;
287 	int i;
288 
289 	bzero(&mbstat_total, sizeof(mbstat_total));
290 	mbstat_totalp = &mbstat_total;
291 
292 	for (i = 0; i < ncpus; i++)
293 	{
294 		mbstat_total.m_mbufs += mbstat[i].m_mbufs;
295 		mbstat_total.m_clusters += mbstat[i].m_clusters;
296 		mbstat_total.m_spare += mbstat[i].m_spare;
297 		mbstat_total.m_clfree += mbstat[i].m_clfree;
298 		mbstat_total.m_drops += mbstat[i].m_drops;
299 		mbstat_total.m_wait += mbstat[i].m_wait;
300 		mbstat_total.m_drain += mbstat[i].m_drain;
301 		mbstat_total.m_mcfail += mbstat[i].m_mcfail;
302 		mbstat_total.m_mpfail += mbstat[i].m_mpfail;
303 
304 	}
305 	/*
306 	 * The following fields are not cumulative, so just
307 	 * get their values once.
308 	 */
309 	mbstat_total.m_msize = mbstat[0].m_msize;
310 	mbstat_total.m_mclbytes = mbstat[0].m_mclbytes;
311 	mbstat_total.m_minclsize = mbstat[0].m_minclsize;
312 	mbstat_total.m_mlen = mbstat[0].m_mlen;
313 	mbstat_total.m_mhlen = mbstat[0].m_mhlen;
314 
315 	return(sysctl_handle_opaque(oidp, mbstat_totalp, sizeof(mbstat_total), req));
316 }
317 
318 static int
319 do_mbtypes(SYSCTL_HANDLER_ARGS)
320 {
321 	u_long totals[MT_NTYPES];
322 	int i, j;
323 
324 	for (i = 0; i < MT_NTYPES; i++)
325 		totals[i] = 0;
326 
327 	for (i = 0; i < ncpus; i++)
328 	{
329 		for (j = 0; j < MT_NTYPES; j++)
330 			totals[j] += mbtypes[i][j];
331 	}
332 
333 	return(sysctl_handle_opaque(oidp, totals, sizeof(totals), req));
334 }
335 
336 /*
337  * These are read-only because we do not currently have any code
338  * to adjust the objcache limits after the fact.  The variables
339  * may only be set as boot-time tunables.
340  */
341 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
342 	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
343 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
344 	   "Maximum number of mbufs available");
345 
346 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
347 	   &m_defragpackets, 0, "Number of packets defragmented");
348 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
349 	   &m_defragbytes, 0, "Number of bytes defragmented");
350 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
351 	   &m_defraguseless, 0, "Number of useless defragment mbuf chain operations");
352 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
353 	   &m_defragfailure, 0, "Number of failed defragment mbuf chain operations");
354 #ifdef MBUF_STRESS_TEST
355 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
356 	   &m_defragrandomfailures, 0, "");
357 #endif
358 
359 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
360 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
361 static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");
362 
363 static void m_reclaim (void);
364 static void m_mclref(void *arg);
365 static void m_mclfree(void *arg);
366 
367 /*
368  * NOTE: Default NMBUFS must take into account a possible DOS attack
369  *	 using fd passing on unix domain sockets.
370  */
371 #ifndef NMBCLUSTERS
372 #define NMBCLUSTERS	(512 + maxusers * 16)
373 #endif
374 #ifndef NMBUFS
375 #define NMBUFS		(nmbclusters * 2 + maxfiles)
376 #endif
377 
378 /*
379  * Perform sanity checks of tunables declared above.
380  */
381 static void
382 tunable_mbinit(void *dummy)
383 {
384 	/*
385 	 * This has to be done before VM init.
386 	 */
387 	nmbclusters = NMBCLUSTERS;
388 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
389 	nmbufs = NMBUFS;
390 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
391 	/* Sanity checks */
392 	if (nmbufs < nmbclusters * 2)
393 		nmbufs = nmbclusters * 2;
394 }
395 SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
396 	tunable_mbinit, NULL);
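
/*
 * Example (hypothetical /boot/loader.conf entries) of overriding the
 * defaults computed above; both tunables are fetched at boot by
 * tunable_mbinit() and the values shown are illustrative only:
 *
 *	kern.ipc.nmbclusters="65536"
 *	kern.ipc.nmbufs="131072"
 */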
397 
398 /* "number of clusters of pages" */
399 #define NCL_INIT	1
400 
401 #define NMB_INIT	16
402 
403 /*
404  * The mbuf object cache only guarantees that m_next and m_nextpkt are
405  * NULL and that m_data points to the beginning of the data area.  In
406  * particular, m_len and m_pkthdr.len are uninitialized.  It is the
407  * responsibility of the caller to initialize those fields before use.
408  */
409 
410 static boolean_t __inline
411 mbuf_ctor(void *obj, void *private, int ocflags)
412 {
413 	struct mbuf *m = obj;
414 
415 	m->m_next = NULL;
416 	m->m_nextpkt = NULL;
417 	m->m_data = m->m_dat;
418 	m->m_flags = 0;
419 
420 	return (TRUE);
421 }
422 
423 /*
424  * Initialize the mbuf and the packet header fields.
425  */
426 static boolean_t
427 mbufphdr_ctor(void *obj, void *private, int ocflags)
428 {
429 	struct mbuf *m = obj;
430 
431 	m->m_next = NULL;
432 	m->m_nextpkt = NULL;
433 	m->m_data = m->m_pktdat;
434 	m->m_flags = M_PKTHDR | M_PHCACHE;
435 
436 	m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
437 	SLIST_INIT(&m->m_pkthdr.tags);
438 	m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
439 	m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
440 
441 	return (TRUE);
442 }
443 
444 /*
445  * An mbcluster object consists of a 2K (MCLBYTES) cluster and a refcount.
446  */
447 static boolean_t
448 mclmeta_ctor(void *obj, void *private, int ocflags)
449 {
450 	struct mbcluster *cl = obj;
451 	void *buf;
452 
453 	if (ocflags & M_NOWAIT)
454 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
455 	else
456 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
457 	if (buf == NULL)
458 		return (FALSE);
459 	cl->mcl_refs = 0;
460 	cl->mcl_data = buf;
461 	return (TRUE);
462 }
463 
464 static void
465 mclmeta_dtor(void *obj, void *private)
466 {
467 	struct mbcluster *mcl = obj;
468 
469 	KKASSERT(mcl->mcl_refs == 0);
470 	kfree(mcl->mcl_data, M_MBUFCL);
471 }
472 
473 static void
474 linkcluster(struct mbuf *m, struct mbcluster *cl)
475 {
476 	/*
477 	 * Add the cluster to the mbuf.  The caller will detect that the
478 	 * mbuf now has an attached cluster.
479 	 */
480 	m->m_ext.ext_arg = cl;
481 	m->m_ext.ext_buf = cl->mcl_data;
482 	m->m_ext.ext_ref = m_mclref;
483 	m->m_ext.ext_free = m_mclfree;
484 	m->m_ext.ext_size = MCLBYTES;
485 	atomic_add_int(&cl->mcl_refs, 1);
486 
487 	m->m_data = m->m_ext.ext_buf;
488 	m->m_flags |= M_EXT | M_EXT_CLUSTER;
489 }
490 
491 static boolean_t
492 mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
493 {
494 	struct mbuf *m = obj;
495 	struct mbcluster *cl;
496 
497 	mbufphdr_ctor(obj, private, ocflags);
498 	cl = objcache_get(mclmeta_cache, ocflags);
499 	if (cl == NULL) {
500 		++mbstat[mycpu->gd_cpuid].m_drops;
501 		return (FALSE);
502 	}
503 	m->m_flags |= M_CLCACHE;
504 	linkcluster(m, cl);
505 	return (TRUE);
506 }
507 
508 static boolean_t
509 mbufcluster_ctor(void *obj, void *private, int ocflags)
510 {
511 	struct mbuf *m = obj;
512 	struct mbcluster *cl;
513 
514 	mbuf_ctor(obj, private, ocflags);
515 	cl = objcache_get(mclmeta_cache, ocflags);
516 	if (cl == NULL) {
517 		++mbstat[mycpu->gd_cpuid].m_drops;
518 		return (FALSE);
519 	}
520 	m->m_flags |= M_CLCACHE;
521 	linkcluster(m, cl);
522 	return (TRUE);
523 }
524 
525 /*
526  * Used for both the cluster and cluster PHDR caches.
527  *
528  * The mbuf may have lost its cluster due to sharing; deal
529  * with the situation by checking M_EXT.
530  */
531 static void
532 mbufcluster_dtor(void *obj, void *private)
533 {
534 	struct mbuf *m = obj;
535 	struct mbcluster *mcl;
536 
537 	if (m->m_flags & M_EXT) {
538 		KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
539 		mcl = m->m_ext.ext_arg;
540 		KKASSERT(mcl->mcl_refs == 1);
541 		mcl->mcl_refs = 0;
542 		objcache_put(mclmeta_cache, mcl);
543 	}
544 }
545 
546 struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
547 struct objcache_malloc_args mclmeta_malloc_args =
548 	{ sizeof(struct mbcluster), M_MCLMETA };
549 
550 /* ARGSUSED*/
551 static void
552 mbinit(void *dummy)
553 {
554 	int mb_limit, cl_limit;
555 	int limit;
556 	int i;
557 
558 	/*
559 	 * Initialize statistics
560 	 */
561 	for (i = 0; i < ncpus; i++) {
562 		atomic_set_long_nonlocked(&mbstat[i].m_msize, MSIZE);
563 		atomic_set_long_nonlocked(&mbstat[i].m_mclbytes, MCLBYTES);
564 		atomic_set_long_nonlocked(&mbstat[i].m_minclsize, MINCLSIZE);
565 		atomic_set_long_nonlocked(&mbstat[i].m_mlen, MLEN);
566 		atomic_set_long_nonlocked(&mbstat[i].m_mhlen, MHLEN);
567 	}
568 
569 	/*
570 	 * Create object caches and save cluster limits, which will
571 	 * be used to adjust backing kmalloc pools' limit later.
572 	 */
573 
574 	mb_limit = cl_limit = 0;
575 
576 	limit = nmbufs;
577 	mbuf_cache = objcache_create("mbuf", &limit, 0,
578 	    mbuf_ctor, NULL, NULL,
579 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
580 	mb_limit += limit;
581 
582 	limit = nmbufs;
583 	mbufphdr_cache = objcache_create("mbuf pkt hdr", &limit, 64,
584 	    mbufphdr_ctor, NULL, NULL,
585 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
586 	mb_limit += limit;
587 
588 	cl_limit = nmbclusters;
589 	mclmeta_cache = objcache_create("cluster mbuf", &cl_limit, 0,
590 	    mclmeta_ctor, mclmeta_dtor, NULL,
591 	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
592 
593 	limit = nmbclusters;
594 	mbufcluster_cache = objcache_create("mbuf + cluster", &limit, 0,
595 	    mbufcluster_ctor, mbufcluster_dtor, NULL,
596 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
597 	mb_limit += limit;
598 
599 	limit = nmbclusters;
600 	mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
601 	    &limit, 64, mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
602 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
603 	mb_limit += limit;
604 
605 	/*
606 	 * Adjust backing kmalloc pools' limit
607 	 *
608 	 * NOTE: We raise the limit by another 1/8 to take the effect
609 	 * of loosememuse into account.
610 	 */
611 	cl_limit += cl_limit / 8;
612 	kmalloc_raise_limit(mclmeta_malloc_args.mtype,
613 			    mclmeta_malloc_args.objsize * cl_limit);
614 	kmalloc_raise_limit(M_MBUFCL, MCLBYTES * cl_limit);
615 
616 	mb_limit += mb_limit / 8;
617 	kmalloc_raise_limit(mbuf_malloc_args.mtype,
618 			    mbuf_malloc_args.objsize * mb_limit);
619 }
620 
621 /*
622  * Return the number of references to this mbuf's data.  0 is returned
623  * if the mbuf is not M_EXT, a reference count is returned if it is
624  * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
625  */
626 int
627 m_sharecount(struct mbuf *m)
628 {
629 	switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
630 	case 0:
631 		return (0);
632 	case M_EXT:
633 		return (99);
634 	case M_EXT | M_EXT_CLUSTER:
635 		return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
636 	}
637 	/* NOTREACHED */
638 	return (0);		/* to shut up compiler */
639 }
640 
641 /*
642  * change mbuf to new type
643  */
644 void
645 m_chtype(struct mbuf *m, int type)
646 {
647 	struct globaldata *gd = mycpu;
648 
649 	atomic_add_long_nonlocked(&mbtypes[gd->gd_cpuid][type], 1);
650 	atomic_subtract_long_nonlocked(&mbtypes[gd->gd_cpuid][m->m_type], 1);
651 	atomic_set_short_nonlocked(&m->m_type, type);
652 }
653 
654 static void
655 m_reclaim(void)
656 {
657 	struct domain *dp;
658 	struct protosw *pr;
659 
660 	kprintf("Debug: m_reclaim() called\n");
661 
662 	SLIST_FOREACH(dp, &domains, dom_next) {
663 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
664 			if (pr->pr_drain)
665 				(*pr->pr_drain)();
666 		}
667 	}
668 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_drain, 1);
669 }
670 
671 static void __inline
672 updatestats(struct mbuf *m, int type)
673 {
674 	struct globaldata *gd = mycpu;
675 
676 	m->m_type = type;
677 	mbuftrack(m);
678 #ifdef MBUF_DEBUG
679 	KASSERT(m->m_next == NULL, ("mbuf %p: bad m_next in get", m));
680 	KASSERT(m->m_nextpkt == NULL, ("mbuf %p: bad m_nextpkt in get", m));
681 #endif
682 
683 	atomic_add_long_nonlocked(&mbtypes[gd->gd_cpuid][type], 1);
684 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mbufs, 1);
685 
686 }
687 
688 /*
689  * Allocate an mbuf.
690  */
691 struct mbuf *
692 m_get(int how, int type)
693 {
694 	struct mbuf *m;
695 	int ntries = 0;
696 	int ocf = MBTOM(how);
697 
698 retryonce:
699 
700 	m = objcache_get(mbuf_cache, ocf);
701 
702 	if (m == NULL) {
703 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
704 			struct objcache *reclaimlist[] = {
705 				mbufphdr_cache,
706 				mbufcluster_cache,
707 				mbufphdrcluster_cache
708 			};
709 			const int nreclaims = NELEM(reclaimlist);
710 
711 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
712 				m_reclaim();
713 			goto retryonce;
714 		}
715 		++mbstat[mycpu->gd_cpuid].m_drops;
716 		return (NULL);
717 	}
718 #ifdef MBUF_DEBUG
719 	KASSERT(m->m_data == m->m_dat, ("mbuf %p: bad m_data in get", m));
720 #endif
721 	m->m_len = 0;
722 
723 	updatestats(m, type);
724 	return (m);
725 }
726 
727 struct mbuf *
728 m_gethdr(int how, int type)
729 {
730 	struct mbuf *m;
731 	int ocf = MBTOM(how);
732 	int ntries = 0;
733 
734 retryonce:
735 
736 	m = objcache_get(mbufphdr_cache, ocf);
737 
738 	if (m == NULL) {
739 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
740 			struct objcache *reclaimlist[] = {
741 				mbuf_cache,
742 				mbufcluster_cache, mbufphdrcluster_cache
743 			};
744 			const int nreclaims = NELEM(reclaimlist);
745 
746 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
747 				m_reclaim();
748 			goto retryonce;
749 		}
750 		++mbstat[mycpu->gd_cpuid].m_drops;
751 		return (NULL);
752 	}
753 #ifdef MBUF_DEBUG
754 	KASSERT(m->m_data == m->m_pktdat, ("mbuf %p: bad m_data in get", m));
755 #endif
756 	m->m_len = 0;
757 	m->m_pkthdr.len = 0;
758 
759 	updatestats(m, type);
760 	return (m);
761 }
762 
763 /*
764  * Get an mbuf (not an mbuf cluster!) and zero it.
765  * Deprecated.
766  */
767 struct mbuf *
768 m_getclr(int how, int type)
769 {
770 	struct mbuf *m;
771 
772 	m = m_get(how, type);
773 	if (m != NULL)
774 		bzero(m->m_data, MLEN);
775 	return (m);
776 }
777 
778 /*
779  * Returns an mbuf with an attached cluster.
780  * Because many network drivers use this kind of buffer a lot, it is
781  * convenient to keep a small pool of free buffers of this kind.
782  * Even a small size such as 10 gives about 10% improvement in the
783  * forwarding rate in a bridge or router.
784  */
785 struct mbuf *
786 m_getcl(int how, short type, int flags)
787 {
788 	struct mbuf *m;
789 	int ocflags = MBTOM(how);
790 	int ntries = 0;
791 
792 retryonce:
793 
794 	if (flags & M_PKTHDR)
795 		m = objcache_get(mbufphdrcluster_cache, ocflags);
796 	else
797 		m = objcache_get(mbufcluster_cache, ocflags);
798 
799 	if (m == NULL) {
800 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
801 			struct objcache *reclaimlist[1];
802 
803 			if (flags & M_PKTHDR)
804 				reclaimlist[0] = mbufcluster_cache;
805 			else
806 				reclaimlist[0] = mbufphdrcluster_cache;
807 			if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
808 				m_reclaim();
809 			goto retryonce;
810 		}
811 		++mbstat[mycpu->gd_cpuid].m_drops;
812 		return (NULL);
813 	}
814 
815 #ifdef MBUF_DEBUG
816 	KASSERT(m->m_data == m->m_ext.ext_buf,
817 		("mbuf %p: bad m_data in get", m));
818 #endif
819 	m->m_type = type;
820 	m->m_len = 0;
821 	m->m_pkthdr.len = 0;	/* just do it unconditionally */
822 
823 	mbuftrack(m);
824 
825 	atomic_add_long_nonlocked(&mbtypes[mycpu->gd_cpuid][type], 1);
826 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1);
827 	return (m);
828 }
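
/*
 * Example (hypothetical driver receive path) of the usual calling
 * pattern; the caller is responsible for setting the final lengths
 * ("pktlen" is an assumed local variable):
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return;
 *	m->m_len = m->m_pkthdr.len = pktlen;
 */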
829 
830 /*
831  * Allocate chain of requested length.
832  */
833 struct mbuf *
834 m_getc(int len, int how, int type)
835 {
836 	struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
837 	int nsize;
838 
839 	while (len > 0) {
840 		n = m_getl(len, how, type, 0, &nsize);
841 		if (n == NULL)
842 			goto failed;
843 		n->m_len = 0;
844 		*ntail = n;
845 		ntail = &n->m_next;
846 		len -= nsize;
847 	}
848 	return (nfirst);
849 
850 failed:
851 	m_freem(nfirst);
852 	return (NULL);
853 }
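
/*
 * Example (hypothetical): build a chain large enough for "totlen"
 * bytes and append it to an existing chain with m_cat(); note that
 * m_cat() does not update any m_pkthdr.len, so the caller must do so:
 *
 *	struct mbuf *n;
 *
 *	n = m_getc(totlen, MB_WAIT, MT_DATA);
 *	if (n != NULL)
 *		m_cat(m, n);
 */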
854 
855 /*
856  * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
857  * and return a pointer to the head of the allocated chain. If m0 is
858  * non-null, then we assume that it is a single mbuf or an mbuf chain to
859  * which we want len bytes worth of mbufs and/or clusters attached, and so
860  * if we succeed in allocating it, we will just return a pointer to m0.
861  *
862  * If we happen to fail at any point during the allocation, we will free
863  * up everything we have already allocated and return NULL.
864  *
865  * Deprecated.  Use m_getc() and m_cat() instead.
866  */
867 struct mbuf *
868 m_getm(struct mbuf *m0, int len, int type, int how)
869 {
870 	struct mbuf *nfirst;
871 
872 	nfirst = m_getc(len, how, type);
873 
874 	if (m0 != NULL) {
875 		m_last(m0)->m_next = nfirst;
876 		return (m0);
877 	}
878 
879 	return (nfirst);
880 }
881 
882 /*
883  * Adds a cluster to a normal mbuf; M_EXT is set on success.
884  * Deprecated.  Use m_getcl() instead.
885  */
886 void
887 m_mclget(struct mbuf *m, int how)
888 {
889 	struct mbcluster *mcl;
890 
891 	KKASSERT((m->m_flags & M_EXT) == 0);
892 	mcl = objcache_get(mclmeta_cache, MBTOM(how));
893 	if (mcl != NULL) {
894 		linkcluster(m, mcl);
895 		atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters,
896 					  1);
897 	} else {
898 		++mbstat[mycpu->gd_cpuid].m_drops;
899 	}
900 }
901 
902 /*
903  * Updates to mbcluster must be MPSAFE.  Only an entity which already has
904  * a reference to the cluster can ref it, so we are in no danger of
905  * racing an add with a subtract.  But the operation must still be atomic
906  * since multiple entities may have a reference on the cluster.
907  *
908  * m_mclfree() is almost the same but it must contend with two entities
909  * freeing the cluster at the same time.
910  */
911 static void
912 m_mclref(void *arg)
913 {
914 	struct mbcluster *mcl = arg;
915 
916 	atomic_add_int(&mcl->mcl_refs, 1);
917 }
918 
919 /*
920  * When dereferencing a cluster we have to deal with an N->0 race, where
921  * N entities free their references simultaneously.  To do this we use
922  * atomic_fetchadd_int().
923  */
924 static void
925 m_mclfree(void *arg)
926 {
927 	struct mbcluster *mcl = arg;
928 
929 	if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1)
930 		objcache_put(mclmeta_cache, mcl);
931 }
932 
933 /*
934  * Free a single mbuf and any associated external storage.  The successor,
935  * if any, is returned.
936  *
937  * We do need to check non-first mbufs for m_aux, since some existing
938  * code does not call M_PREPEND properly.
939  * (example: call to bpf_mtap from drivers)
940  */
941 
942 #ifdef MBUF_DEBUG
943 
944 struct mbuf  *
945 _m_free(struct mbuf *m, const char *func)
946 
947 #else
948 
949 struct mbuf *
950 m_free(struct mbuf *m)
951 
952 #endif
953 {
954 	struct mbuf *n;
955 	struct globaldata *gd = mycpu;
956 
957 	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
958 	KASSERT(M_TRAILINGSPACE(m) >= 0, ("overflowed mbuf %p", m));
959 	atomic_subtract_long_nonlocked(&mbtypes[gd->gd_cpuid][m->m_type], 1);
960 
961 	n = m->m_next;
962 
963 	/*
964 	 * Make sure the mbuf is in constructed state before returning it
965 	 * to the objcache.
966 	 */
967 	m->m_next = NULL;
968 	mbufuntrack(m);
969 #ifdef MBUF_DEBUG
970 	m->m_hdr.mh_lastfunc = func;
971 #endif
972 #ifdef notyet
973 	KKASSERT(m->m_nextpkt == NULL);
974 #else
975 	if (m->m_nextpkt != NULL) {
976 		static int afewtimes = 10;
977 
978 		if (afewtimes-- > 0) {
979 			kprintf("mfree: m->m_nextpkt != NULL\n");
980 			print_backtrace(-1);
981 		}
982 		m->m_nextpkt = NULL;
983 	}
984 #endif
985 	if (m->m_flags & M_PKTHDR) {
986 		m_tag_delete_chain(m);		/* eliminate XXX JH */
987 	}
988 
989 	m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
990 
991 	/*
992 	 * Clean the M_PKTHDR state so we can return the mbuf to its original
993 	 * cache.  This is based on the PHCACHE flag which tells us whether
994 	 * the mbuf was originally allocated out of a packet-header cache
995 	 * or a non-packet-header cache.
996 	 */
997 	if (m->m_flags & M_PHCACHE) {
998 		m->m_flags |= M_PKTHDR;
999 		m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
1000 		m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
1001 		m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
1002 		SLIST_INIT(&m->m_pkthdr.tags);
1003 	}
1004 
1005 	/*
1006 	 * Handle remaining flags combinations.  M_CLCACHE tells us whether
1007 	 * the mbuf was originally allocated from a cluster cache or not,
1008 	 * and is totally separate from whether the mbuf is currently
1009 	 * associated with a cluster.
1010 	 */
1011 	switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
1012 	case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
1013 		/*
1014 		 * mbuf+cluster cache case.  The mbuf was allocated from the
1015 		 * combined mbuf_cluster cache and can be returned to the
1016 		 * cache if the cluster hasn't been shared.
1017 		 */
1018 		if (m_sharecount(m) == 1) {
1019 			/*
1020 			 * The cluster has not been shared, we can just
1021 			 * reset the data pointer and return the mbuf
1022 			 * to the cluster cache.  Note that the reference
1023 			 * count is left intact (it is still associated with
1024 			 * an mbuf).
1025 			 */
1026 			m->m_data = m->m_ext.ext_buf;
1027 			if (m->m_flags & M_PHCACHE)
1028 				objcache_put(mbufphdrcluster_cache, m);
1029 			else
1030 				objcache_put(mbufcluster_cache, m);
1031 			atomic_subtract_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1);
1032 		} else {
1033 			/*
1034 			 * Hell.  Someone else has a ref on this cluster,
1035 			 * we have to disconnect it which means we can't
1036 			 * put it back into the mbufcluster_cache, we
1037 			 * have to destroy the mbuf.
1038 			 *
1039 			 * Other mbuf references to the cluster will typically
1040 			 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
1041 			 *
1042 			 * XXX we could try to connect another cluster to
1043 			 * it.
1044 			 */
1045 			m->m_ext.ext_free(m->m_ext.ext_arg);
1046 			m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1047 			if (m->m_flags & M_PHCACHE)
1048 				objcache_dtor(mbufphdrcluster_cache, m);
1049 			else
1050 				objcache_dtor(mbufcluster_cache, m);
1051 		}
1052 		break;
1053 	case M_EXT | M_EXT_CLUSTER:
1054 		/*
1055 		 * Normal cluster associated with an mbuf that was allocated
1056 		 * from the normal mbuf pool rather than the cluster pool.
1057 		 * The cluster has to be independently disassociated from the
1058 		 * mbuf.
1059 		 */
1060 		if (m_sharecount(m) == 1)
1061 			atomic_subtract_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1);
1062 		/* fall through */
1063 	case M_EXT:
1064 		/*
1065 		 * Normal cluster association case, disconnect the cluster from
1066 		 * the mbuf.  The cluster may or may not be custom.
1067 		 */
1068 		m->m_ext.ext_free(m->m_ext.ext_arg);
1069 		m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1070 		/* fall through */
1071 	case 0:
1072 		/*
1073 		 * return the mbuf to the mbuf cache.
1074 		 */
1075 		if (m->m_flags & M_PHCACHE) {
1076 			m->m_data = m->m_pktdat;
1077 			objcache_put(mbufphdr_cache, m);
1078 		} else {
1079 			m->m_data = m->m_dat;
1080 			objcache_put(mbuf_cache, m);
1081 		}
1082 		atomic_subtract_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mbufs, 1);
1083 		break;
1084 	default:
1085 		if (!panicstr)
1086 			panic("bad mbuf flags %p %08x\n", m, m->m_flags);
1087 		break;
1088 	}
1089 	return (n);
1090 }
1091 
1092 #ifdef MBUF_DEBUG
1093 
1094 void
1095 _m_freem(struct mbuf *m, const char *func)
1096 {
1097 	while (m)
1098 		m = _m_free(m, func);
1099 }
1100 
1101 #else
1102 
1103 void
1104 m_freem(struct mbuf *m)
1105 {
1106 	while (m)
1107 		m = m_free(m);
1108 }
1109 
1110 #endif
1111 
1112 /*
1113  * mbuf utility routines
1114  */
1115 
1116 /*
1117  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
1118  * copy junk along.
1119  */
1120 struct mbuf *
1121 m_prepend(struct mbuf *m, int len, int how)
1122 {
1123 	struct mbuf *mn;
1124 
1125 	if (m->m_flags & M_PKTHDR)
1126 	    mn = m_gethdr(how, m->m_type);
1127 	else
1128 	    mn = m_get(how, m->m_type);
1129 	if (mn == NULL) {
1130 		m_freem(m);
1131 		return (NULL);
1132 	}
1133 	if (m->m_flags & M_PKTHDR)
1134 		M_MOVE_PKTHDR(mn, m);
1135 	mn->m_next = m;
1136 	m = mn;
1137 	if (len < MHLEN)
1138 		MH_ALIGN(m, len);
1139 	m->m_len = len;
1140 	return (m);
1141 }
1142 
1143 /*
1144  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1145  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
1146  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
1147  * Note that the copy is read-only, because clusters are not copied,
1148  * only their reference counts are incremented.
1149  */
1150 struct mbuf *
1151 m_copym(const struct mbuf *m, int off0, int len, int wait)
1152 {
1153 	struct mbuf *n, **np;
1154 	int off = off0;
1155 	struct mbuf *top;
1156 	int copyhdr = 0;
1157 
1158 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
1159 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
1160 	if (off == 0 && (m->m_flags & M_PKTHDR))
1161 		copyhdr = 1;
1162 	while (off > 0) {
1163 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1164 		if (off < m->m_len)
1165 			break;
1166 		off -= m->m_len;
1167 		m = m->m_next;
1168 	}
1169 	np = &top;
1170 	top = NULL;
1171 	while (len > 0) {
1172 		if (m == NULL) {
1173 			KASSERT(len == M_COPYALL,
1174 			    ("m_copym, length > size of mbuf chain"));
1175 			break;
1176 		}
1177 		/*
1178 		 * Because we are sharing any cluster attachment below,
1179 		 * be sure to get an mbuf that does not have a cluster
1180 		 * associated with it.
1181 		 */
1182 		if (copyhdr)
1183 			n = m_gethdr(wait, m->m_type);
1184 		else
1185 			n = m_get(wait, m->m_type);
1186 		*np = n;
1187 		if (n == NULL)
1188 			goto nospace;
1189 		if (copyhdr) {
1190 			if (!m_dup_pkthdr(n, m, wait))
1191 				goto nospace;
1192 			if (len == M_COPYALL)
1193 				n->m_pkthdr.len -= off0;
1194 			else
1195 				n->m_pkthdr.len = len;
1196 			copyhdr = 0;
1197 		}
1198 		n->m_len = min(len, m->m_len - off);
1199 		if (m->m_flags & M_EXT) {
1200 			KKASSERT((n->m_flags & M_EXT) == 0);
1201 			n->m_data = m->m_data + off;
1202 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1203 			n->m_ext = m->m_ext;
1204 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1205 		} else {
1206 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1207 			    (unsigned)n->m_len);
1208 		}
1209 		if (len != M_COPYALL)
1210 			len -= n->m_len;
1211 		off = 0;
1212 		m = m->m_next;
1213 		np = &n->m_next;
1214 	}
1215 	if (top == NULL)
1216 		atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1217 	return (top);
1218 nospace:
1219 	m_freem(top);
1220 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1221 	return (NULL);
1222 }
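
/*
 * Example (hypothetical retransmit-style use): take a read-only copy
 * of an entire chain without copying the cluster data:
 *
 *	struct mbuf *copy;
 *
 *	copy = m_copym(m, 0, M_COPYALL, MB_DONTWAIT);
 *	if (copy == NULL)
 *		return (ENOBUFS);
 */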
1223 
1224 /*
1225  * Copy an entire packet, including header (which must be present).
1226  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1227  * Note that the copy is read-only, because clusters are not copied,
1228  * only their reference counts are incremented.
1229  * Preserve alignment of the first mbuf so if the creator has left
1230  * some room at the beginning (e.g. for inserting protocol headers)
1231  * the copies also have the room available.
1232  */
1233 struct mbuf *
1234 m_copypacket(struct mbuf *m, int how)
1235 {
1236 	struct mbuf *top, *n, *o;
1237 
1238 	n = m_gethdr(how, m->m_type);
1239 	top = n;
1240 	if (!n)
1241 		goto nospace;
1242 
1243 	if (!m_dup_pkthdr(n, m, how))
1244 		goto nospace;
1245 	n->m_len = m->m_len;
1246 	if (m->m_flags & M_EXT) {
1247 		KKASSERT((n->m_flags & M_EXT) == 0);
1248 		n->m_data = m->m_data;
1249 		m->m_ext.ext_ref(m->m_ext.ext_arg);
1250 		n->m_ext = m->m_ext;
1251 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1252 	} else {
1253 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
1254 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1255 	}
1256 
1257 	m = m->m_next;
1258 	while (m) {
1259 		o = m_get(how, m->m_type);
1260 		if (!o)
1261 			goto nospace;
1262 
1263 		n->m_next = o;
1264 		n = n->m_next;
1265 
1266 		n->m_len = m->m_len;
1267 		if (m->m_flags & M_EXT) {
1268 			KKASSERT((n->m_flags & M_EXT) == 0);
1269 			n->m_data = m->m_data;
1270 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1271 			n->m_ext = m->m_ext;
1272 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1273 		} else {
1274 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1275 		}
1276 
1277 		m = m->m_next;
1278 	}
1279 	return top;
1280 nospace:
1281 	m_freem(top);
1282 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1283 	return (NULL);
1284 }
1285 
1286 /*
1287  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1288  * continuing for "len" bytes, into the indicated buffer.
1289  */
1290 void
1291 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1292 {
1293 	unsigned count;
1294 
1295 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1296 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1297 	while (off > 0) {
1298 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1299 		if (off < m->m_len)
1300 			break;
1301 		off -= m->m_len;
1302 		m = m->m_next;
1303 	}
1304 	while (len > 0) {
1305 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1306 		count = min(m->m_len - off, len);
1307 		bcopy(mtod(m, caddr_t) + off, cp, count);
1308 		len -= count;
1309 		cp += count;
1310 		off = 0;
1311 		m = m->m_next;
1312 	}
1313 }
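
/*
 * Example (hypothetical): pull a fixed-size header out of a chain
 * into a local structure, regardless of how the chain is fragmented
 * ("struct hypothetical_hdr" is an assumed caller-side type):
 *
 *	struct hypothetical_hdr hdr;
 *
 *	m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr);
 */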
1314 
1315 /*
1316  * Copy a packet header mbuf chain into a completely new chain, including
1317  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1318  * you need a writable copy of an mbuf chain.
1319  */
1320 struct mbuf *
1321 m_dup(struct mbuf *m, int how)
1322 {
1323 	struct mbuf **p, *top = NULL;
1324 	int remain, moff, nsize;
1325 
1326 	/* Sanity check */
1327 	if (m == NULL)
1328 		return (NULL);
1329 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1330 
1331 	/* While there's more data, get a new mbuf, tack it on, and fill it */
1332 	remain = m->m_pkthdr.len;
1333 	moff = 0;
1334 	p = &top;
1335 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
1336 		struct mbuf *n;
1337 
1338 		/* Get the next new mbuf */
1339 		n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1340 			   &nsize);
1341 		if (n == NULL)
1342 			goto nospace;
1343 		if (top == NULL)
1344 			if (!m_dup_pkthdr(n, m, how))
1345 				goto nospace0;
1346 
1347 		/* Link it into the new chain */
1348 		*p = n;
1349 		p = &n->m_next;
1350 
1351 		/* Copy data from original mbuf(s) into new mbuf */
1352 		n->m_len = 0;
1353 		while (n->m_len < nsize && m != NULL) {
1354 			int chunk = min(nsize - n->m_len, m->m_len - moff);
1355 
1356 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1357 			moff += chunk;
1358 			n->m_len += chunk;
1359 			remain -= chunk;
1360 			if (moff == m->m_len) {
1361 				m = m->m_next;
1362 				moff = 0;
1363 			}
1364 		}
1365 
1366 		/* Check correct total mbuf length */
1367 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1368 			("%s: bogus m_pkthdr.len", __func__));
1369 	}
1370 	return (top);
1371 
1372 nospace:
1373 	m_freem(top);
1374 nospace0:
1375 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1376 	return (NULL);
1377 }
1378 
1379 /*
1380  * Copy the non-packet mbuf data chain into a new set of mbufs, including
1381  * copying any mbuf clusters.  This is typically used to realign a data
1382  * chain by nfs_realign().
1383  *
1384  * The original chain is left intact.  how should be MB_WAIT or MB_DONTWAIT
1385  * and NULL can be returned if MB_DONTWAIT is passed.
1386  *
1387  * Be careful to use cluster mbufs: a large mbuf chain converted to
1388  * non-cluster mbufs can exhaust the supply of mbufs.
1389  */
1390 struct mbuf *
1391 m_dup_data(struct mbuf *m, int how)
1392 {
1393 	struct mbuf **p, *n, *top = NULL;
1394 	int mlen, moff, chunk, gsize, nsize;
1395 
1396 	/*
1397 	 * Degenerate case
1398 	 */
1399 	if (m == NULL)
1400 		return (NULL);
1401 
1402 	/*
1403 	 * Optimize the mbuf allocation but do not get too carried away.
1404 	 */
1405 	if (m->m_next || m->m_len > MLEN)
1406 		gsize = MCLBYTES;
1407 	else
1408 		gsize = MLEN;
1409 
1410 	/* Chain control */
1411 	p = &top;
1412 	n = NULL;
1413 	nsize = 0;
1414 
1415 	/*
1416 	 * Scan the mbuf chain until nothing is left; the new mbuf chain
1417 	 * will be allocated on the fly as needed.
1418 	 */
1419 	while (m) {
1420 		mlen = m->m_len;
1421 		moff = 0;
1422 
1423 		while (mlen) {
1424 			KKASSERT(m->m_type == MT_DATA);
1425 			if (n == NULL) {
1426 				n = m_getl(gsize, how, MT_DATA, 0, &nsize);
1427 				if (n == NULL)
1428 					goto nospace;
1429 				n->m_len = 0;
1430 				*p = n;
1431 				p = &n->m_next;
1432 			}
1433 			chunk = imin(mlen, nsize);
1434 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1435 			mlen -= chunk;
1436 			moff += chunk;
1437 			n->m_len += chunk;
1438 			nsize -= chunk;
1439 			if (nsize == 0)
1440 				n = NULL;
1441 		}
1442 		m = m->m_next;
1443 	}
1444 	*p = NULL;
1445 	return(top);
1446 nospace:
1447 	*p = NULL;
1448 	m_freem(top);
1449 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1450 	return (NULL);
1451 }
1452 
1453 /*
1454  * Concatenate mbuf chain n to m.
1455  * Both chains must be of the same type (e.g. MT_DATA).
1456  * Any m_pkthdr is not updated.
1457  */
1458 void
1459 m_cat(struct mbuf *m, struct mbuf *n)
1460 {
1461 	m = m_last(m);
1462 	while (n) {
1463 		if (m->m_flags & M_EXT ||
1464 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1465 			/* just join the two chains */
1466 			m->m_next = n;
1467 			return;
1468 		}
1469 		/* splat the data from one into the other */
1470 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1471 		    (u_int)n->m_len);
1472 		m->m_len += n->m_len;
1473 		n = m_free(n);
1474 	}
1475 }
1476 
1477 void
1478 m_adj(struct mbuf *mp, int req_len)
1479 {
1480 	int len = req_len;
1481 	struct mbuf *m;
1482 	int count;
1483 
1484 	if ((m = mp) == NULL)
1485 		return;
1486 	if (len >= 0) {
1487 		/*
1488 		 * Trim from head.
1489 		 */
1490 		while (m != NULL && len > 0) {
1491 			if (m->m_len <= len) {
1492 				len -= m->m_len;
1493 				m->m_len = 0;
1494 				m = m->m_next;
1495 			} else {
1496 				m->m_len -= len;
1497 				m->m_data += len;
1498 				len = 0;
1499 			}
1500 		}
1501 		m = mp;
1502 		if (mp->m_flags & M_PKTHDR)
1503 			m->m_pkthdr.len -= (req_len - len);
1504 	} else {
1505 		/*
1506 		 * Trim from tail.  Scan the mbuf chain,
1507 		 * calculating its length and finding the last mbuf.
1508 		 * If the adjustment only affects this mbuf, then just
1509 		 * adjust and return.  Otherwise, rescan and truncate
1510 		 * after the remaining size.
1511 		 */
1512 		len = -len;
1513 		count = 0;
1514 		for (;;) {
1515 			count += m->m_len;
1516 			if (m->m_next == NULL)
1517 				break;
1518 			m = m->m_next;
1519 		}
1520 		if (m->m_len >= len) {
1521 			m->m_len -= len;
1522 			if (mp->m_flags & M_PKTHDR)
1523 				mp->m_pkthdr.len -= len;
1524 			return;
1525 		}
1526 		count -= len;
1527 		if (count < 0)
1528 			count = 0;
1529 		/*
1530 		 * Correct length for chain is "count".
1531 		 * Find the mbuf with last data, adjust its length,
1532 		 * and toss data from remaining mbufs on chain.
1533 		 */
1534 		m = mp;
1535 		if (m->m_flags & M_PKTHDR)
1536 			m->m_pkthdr.len = count;
1537 		for (; m; m = m->m_next) {
1538 			if (m->m_len >= count) {
1539 				m->m_len = count;
1540 				break;
1541 			}
1542 			count -= m->m_len;
1543 		}
1544 		while (m->m_next)
1545 			(m = m->m_next)->m_len = 0;
1546 	}
1547 }
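
/*
 * Example (hypothetical): strip ETHER_HDR_LEN bytes of link header
 * from the front of a packet, or trim ETHER_CRC_LEN bytes of trailer
 * from the end by passing a negative length (both constants are
 * assumed to be available to the caller):
 *
 *	m_adj(m, ETHER_HDR_LEN);
 *	m_adj(m, -ETHER_CRC_LEN);
 */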
1548 
1549 /*
1550  * Set the m_data pointer of a newly-allocated mbuf
1551  * to place an object of the specified size at the
1552  * end of the mbuf, longword aligned.
1553  */
1554 void
1555 m_align(struct mbuf *m, int len)
1556 {
1557 	int adjust;
1558 
1559 	if (m->m_flags & M_EXT)
1560 		adjust = m->m_ext.ext_size - len;
1561 	else if (m->m_flags & M_PKTHDR)
1562 		adjust = MHLEN - len;
1563 	else
1564 		adjust = MLEN - len;
1565 	m->m_data += adjust &~ (sizeof(long)-1);
1566 }
1567 
1568 /*
1569  * Rearrange an mbuf chain so that len bytes are contiguous
1570  * and in the data area of an mbuf (so that mtod will work for a structure
1571  * of size len).  Returns the resulting mbuf chain on success, frees it and
1572  * returns null on failure.  If there is room, it will add up to
1573  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1574  * avoid being called next time.
1575  */
1576 struct mbuf *
1577 m_pullup(struct mbuf *n, int len)
1578 {
1579 	struct mbuf *m;
1580 	int count;
1581 	int space;
1582 
1583 	/*
1584 	 * If first mbuf has no cluster, and has room for len bytes
1585 	 * without shifting current data, pullup into it,
1586 	 * otherwise allocate a new mbuf to prepend to the chain.
1587 	 */
1588 	if (!(n->m_flags & M_EXT) &&
1589 	    n->m_data + len < &n->m_dat[MLEN] &&
1590 	    n->m_next) {
1591 		if (n->m_len >= len)
1592 			return (n);
1593 		m = n;
1594 		n = n->m_next;
1595 		len -= m->m_len;
1596 	} else {
1597 		if (len > MHLEN)
1598 			goto bad;
1599 		if (n->m_flags & M_PKTHDR)
1600 			m = m_gethdr(MB_DONTWAIT, n->m_type);
1601 		else
1602 			m = m_get(MB_DONTWAIT, n->m_type);
1603 		if (m == NULL)
1604 			goto bad;
1605 		m->m_len = 0;
1606 		if (n->m_flags & M_PKTHDR)
1607 			M_MOVE_PKTHDR(m, n);
1608 	}
1609 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1610 	do {
1611 		count = min(min(max(len, max_protohdr), space), n->m_len);
1612 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1613 		  (unsigned)count);
1614 		len -= count;
1615 		m->m_len += count;
1616 		n->m_len -= count;
1617 		space -= count;
1618 		if (n->m_len)
1619 			n->m_data += count;
1620 		else
1621 			n = m_free(n);
1622 	} while (len > 0 && n);
1623 	if (len > 0) {
1624 		m_free(m);
1625 		goto bad;
1626 	}
1627 	m->m_next = n;
1628 	return (m);
1629 bad:
1630 	m_freem(n);
1631 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1632 	return (NULL);
1633 }
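
/*
 * Example (hypothetical protocol input path): make sure the first
 * sizeof(struct ip) bytes are contiguous before dereferencing m_data
 * (struct ip is assumed to be visible to the caller; the chain is
 * freed by m_pullup() on failure):
 *
 *	if (m->m_len < sizeof(struct ip)) {
 *		m = m_pullup(m, sizeof(struct ip));
 *		if (m == NULL)
 *			return;
 *	}
 *	ip = mtod(m, struct ip *);
 */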
1634 
1635 /*
1636  * Partition an mbuf chain in two pieces, returning the tail --
1637  * all but the first len0 bytes.  In case of failure, it returns NULL and
1638  * attempts to restore the chain to its original state.
1639  *
1640  * Note that the resulting mbufs might be read-only, because the new
1641  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1642  * the "breaking point" happens to lie within a cluster mbuf. Use the
1643  * M_WRITABLE() macro to check for this case.
1644  */
1645 struct mbuf *
1646 m_split(struct mbuf *m0, int len0, int wait)
1647 {
1648 	struct mbuf *m, *n;
1649 	unsigned len = len0, remain;
1650 
1651 	for (m = m0; m && len > m->m_len; m = m->m_next)
1652 		len -= m->m_len;
1653 	if (m == NULL)
1654 		return (NULL);
1655 	remain = m->m_len - len;
1656 	if (m0->m_flags & M_PKTHDR) {
1657 		n = m_gethdr(wait, m0->m_type);
1658 		if (n == NULL)
1659 			return (NULL);
1660 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1661 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1662 		m0->m_pkthdr.len = len0;
1663 		if (m->m_flags & M_EXT)
1664 			goto extpacket;
1665 		if (remain > MHLEN) {
1666 			/* m can't be the lead packet */
1667 			MH_ALIGN(n, 0);
1668 			n->m_next = m_split(m, len, wait);
1669 			if (n->m_next == NULL) {
1670 				m_free(n);
1671 				return (NULL);
1672 			} else {
1673 				n->m_len = 0;
1674 				return (n);
1675 			}
1676 		} else
1677 			MH_ALIGN(n, remain);
1678 	} else if (remain == 0) {
1679 		n = m->m_next;
1680 		m->m_next = 0;
1681 		return (n);
1682 	} else {
1683 		n = m_get(wait, m->m_type);
1684 		if (n == NULL)
1685 			return (NULL);
1686 		M_ALIGN(n, remain);
1687 	}
1688 extpacket:
1689 	if (m->m_flags & M_EXT) {
1690 		KKASSERT((n->m_flags & M_EXT) == 0);
1691 		n->m_data = m->m_data + len;
1692 		m->m_ext.ext_ref(m->m_ext.ext_arg);
1693 		n->m_ext = m->m_ext;
1694 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1695 	} else {
1696 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1697 	}
1698 	n->m_len = remain;
1699 	m->m_len = len;
1700 	n->m_next = m->m_next;
1701 	m->m_next = 0;
1702 	return (n);
1703 }
1704 
1705 /*
1706  * Routine to copy from device local memory into mbufs.
1707  * Note: "offset" is ill-defined and always called as 0, so ignore it.
1708  */
1709 struct mbuf *
1710 m_devget(char *buf, int len, int offset, struct ifnet *ifp,
1711     void (*copy)(volatile const void *from, volatile void *to, size_t length))
1712 {
1713 	struct mbuf *m, *mfirst = NULL, **mtail;
1714 	int nsize, flags;
1715 
1716 	if (copy == NULL)
1717 		copy = bcopy;
1718 	mtail = &mfirst;
1719 	flags = M_PKTHDR;
1720 
1721 	while (len > 0) {
1722 		m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
1723 		if (m == NULL) {
1724 			m_freem(mfirst);
1725 			return (NULL);
1726 		}
1727 		m->m_len = min(len, nsize);
1728 
1729 		if (flags & M_PKTHDR) {
1730 			if (len + max_linkhdr <= nsize)
1731 				m->m_data += max_linkhdr;
1732 			m->m_pkthdr.rcvif = ifp;
1733 			m->m_pkthdr.len = len;
1734 			flags = 0;
1735 		}
1736 
1737 		copy(buf, m->m_data, (unsigned)m->m_len);
1738 		buf += m->m_len;
1739 		len -= m->m_len;
1740 		*mtail = m;
1741 		mtail = &m->m_next;
1742 	}
1743 
1744 	return (mfirst);
1745 }
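
/*
 * Example (hypothetical legacy driver): copy a received frame out of
 * a device buffer into a fresh chain; passing NULL for the copy
 * routine selects bcopy ("rxbuf", "pktlen" and "ifp" are assumed
 * driver-side locals):
 *
 *	m = m_devget(rxbuf, pktlen, 0, ifp, NULL);
 *	if (m == NULL)
 *		return;
 */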
1746 
1747 /*
1748  * Routine to pad mbuf to the specified length 'padto'.
1749  */
1750 int
1751 m_devpad(struct mbuf *m, int padto)
1752 {
1753 	struct mbuf *last = NULL;
1754 	int padlen;
1755 
1756 	if (padto <= m->m_pkthdr.len)
1757 		return 0;
1758 
1759 	padlen = padto - m->m_pkthdr.len;
1760 
1761 	/* if there's only the packet-header and we can pad there, use it. */
1762 	if (m->m_pkthdr.len == m->m_len && M_TRAILINGSPACE(m) >= padlen) {
1763 		last = m;
1764 	} else {
1765 		/*
1766 		 * Walk packet chain to find last mbuf. We will either
1767 		 * pad there, or append a new mbuf and pad it
1768 		 */
1769 		for (last = m; last->m_next != NULL; last = last->m_next)
1770 			; /* EMPTY */
1771 
1772 		/* `last' now points to last in chain. */
1773 		if (M_TRAILINGSPACE(last) < padlen) {
1774 			struct mbuf *n;
1775 
1776 			/* Allocate new empty mbuf, pad it.  Compact later. */
1777 			MGET(n, MB_DONTWAIT, MT_DATA);
1778 			if (n == NULL)
1779 				return ENOBUFS;
1780 			n->m_len = 0;
1781 			last->m_next = n;
1782 			last = n;
1783 		}
1784 	}
1785 	KKASSERT(M_TRAILINGSPACE(last) >= padlen);
1786 	KKASSERT(M_WRITABLE(last));
1787 
1788 	/* Now zero the pad area */
1789 	bzero(mtod(last, char *) + last->m_len, padlen);
1790 	last->m_len += padlen;
1791 	m->m_pkthdr.len += padlen;
1792 	return 0;
1793 }
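
/*
 * Example (hypothetical transmit path): pad a short Ethernet frame up
 * to the minimum frame size before handing it to the hardware
 * (ETHER_MIN_LEN and ETHER_CRC_LEN are assumed to be in scope):
 *
 *	if (m->m_pkthdr.len < ETHER_MIN_LEN - ETHER_CRC_LEN) {
 *		if (m_devpad(m, ETHER_MIN_LEN - ETHER_CRC_LEN) != 0) {
 *			m_freem(m);
 *			return;
 *		}
 *	}
 */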
1794 
1795 /*
1796  * Copy data from a buffer back into the indicated mbuf chain,
1797  * starting "off" bytes from the beginning, extending the mbuf
1798  * chain if necessary.
1799  */
1800 void
1801 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1802 {
1803 	int mlen;
1804 	struct mbuf *m = m0, *n;
1805 	int totlen = 0;
1806 
1807 	if (m0 == NULL)
1808 		return;
1809 	while (off > (mlen = m->m_len)) {
1810 		off -= mlen;
1811 		totlen += mlen;
1812 		if (m->m_next == NULL) {
1813 			n = m_getclr(MB_DONTWAIT, m->m_type);
1814 			if (n == NULL)
1815 				goto out;
1816 			n->m_len = min(MLEN, len + off);
1817 			m->m_next = n;
1818 		}
1819 		m = m->m_next;
1820 	}
1821 	while (len > 0) {
1822 		mlen = min (m->m_len - off, len);
1823 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1824 		cp += mlen;
1825 		len -= mlen;
1826 		mlen += off;
1827 		off = 0;
1828 		totlen += mlen;
1829 		if (len == 0)
1830 			break;
1831 		if (m->m_next == NULL) {
1832 			n = m_get(MB_DONTWAIT, m->m_type);
1833 			if (n == NULL)
1834 				break;
1835 			n->m_len = min(MLEN, len);
1836 			m->m_next = n;
1837 		}
1838 		m = m->m_next;
1839 	}
1840 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1841 		m->m_pkthdr.len = totlen;
1842 }
1843 
1844 /*
1845  * Append the specified data to the indicated mbuf chain.
1846  * Extend the mbuf chain if the new data does not fit in
1847  * existing space.
1848  *
1849  * Return 1 if able to complete the job; otherwise 0.
1850  */
1851 int
1852 m_append(struct mbuf *m0, int len, c_caddr_t cp)
1853 {
1854 	struct mbuf *m, *n;
1855 	int remainder, space;
1856 
1857 	for (m = m0; m->m_next != NULL; m = m->m_next)
1858 		;
1859 	remainder = len;
1860 	space = M_TRAILINGSPACE(m);
1861 	if (space > 0) {
1862 		/*
1863 		 * Copy into available space.
1864 		 */
1865 		if (space > remainder)
1866 			space = remainder;
1867 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1868 		m->m_len += space;
1869 		cp += space, remainder -= space;
1870 	}
1871 	while (remainder > 0) {
1872 		/*
1873 		 * Allocate a new mbuf; could check space
1874 		 * and allocate a cluster instead.
1875 		 */
1876 		n = m_get(MB_DONTWAIT, m->m_type);
1877 		if (n == NULL)
1878 			break;
1879 		n->m_len = min(MLEN, remainder);
1880 		bcopy(cp, mtod(n, caddr_t), n->m_len);
1881 		cp += n->m_len, remainder -= n->m_len;
1882 		m->m_next = n;
1883 		m = n;
1884 	}
1885 	if (m0->m_flags & M_PKTHDR)
1886 		m0->m_pkthdr.len += len - remainder;
1887 	return (remainder == 0);
1888 }
1889 
1890 /*
1891  * Apply function f to the data in an mbuf chain starting "off" bytes from
1892  * the beginning, continuing for "len" bytes.
1893  */
1894 int
1895 m_apply(struct mbuf *m, int off, int len,
1896     int (*f)(void *, void *, u_int), void *arg)
1897 {
1898 	u_int count;
1899 	int rval;
1900 
1901 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
1902 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
1903 	while (off > 0) {
1904 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1905 		if (off < m->m_len)
1906 			break;
1907 		off -= m->m_len;
1908 		m = m->m_next;
1909 	}
1910 	while (len > 0) {
1911 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1912 		count = min(m->m_len - off, len);
1913 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
1914 		if (rval)
1915 			return (rval);
1916 		len -= count;
1917 		off = 0;
1918 		m = m->m_next;
1919 	}
1920 	return (0);
1921 }
1922 
1923 /*
1924  * Return a pointer to mbuf/offset of location in mbuf chain.
1925  */
1926 struct mbuf *
1927 m_getptr(struct mbuf *m, int loc, int *off)
1928 {
1929 
1930 	while (loc >= 0) {
1931 		/* Normal end of search. */
1932 		if (m->m_len > loc) {
1933 			*off = loc;
1934 			return (m);
1935 		} else {
1936 			loc -= m->m_len;
1937 			if (m->m_next == NULL) {
1938 				if (loc == 0) {
1939 					/* Point at the end of valid data. */
1940 					*off = m->m_len;
1941 					return (m);
1942 				}
1943 				return (NULL);
1944 			}
1945 			m = m->m_next;
1946 		}
1947 	}
1948 	return (NULL);
1949 }
1950 
1951 void
1952 m_print(const struct mbuf *m)
1953 {
1954 	int len;
1955 	const struct mbuf *m2;
1956 
1957 	len = m->m_pkthdr.len;
1958 	m2 = m;
1959 	while (len) {
1960 		kprintf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1961 		len -= m2->m_len;
1962 		m2 = m2->m_next;
1963 	}
1964 	return;
1965 }
1966 
1967 /*
1968  * "Move" mbuf pkthdr from "from" to "to".
1969  * "from" must have M_PKTHDR set, and "to" must be empty.
1970  */
1971 void
1972 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1973 {
1974 	KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
1975 
1976 	to->m_flags |= from->m_flags & M_COPYFLAGS;
1977 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
1978 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
1979 }
1980 
1981 /*
1982  * Duplicate "from"'s mbuf pkthdr in "to".
1983  * "from" must have M_PKTHDR set, and "to" must be empty.
1984  * In particular, this does a deep copy of the packet tags.
1985  */
1986 int
1987 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1988 {
1989 	KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
1990 
1991 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
1992 		      (to->m_flags & ~M_COPYFLAGS);
1993 	to->m_pkthdr = from->m_pkthdr;
1994 	SLIST_INIT(&to->m_pkthdr.tags);
1995 	return (m_tag_copy_chain(to, from, how));
1996 }
1997 
1998 /*
1999  * Defragment an mbuf chain, returning the shortest possible
2000  * chain of mbufs and clusters.  If allocation fails and
2001  * this cannot be completed, NULL will be returned, but
2002  * the passed in chain will be unchanged.  Upon success,
2003  * the original chain will be freed, and the new chain
2004  * will be returned.
2005  *
2006  * If an mbuf chain without a packet header is passed in, the
2007  * original chain will be returned unharmed.
2008  *
2009  * m_defrag_nofree doesn't free the passed in mbuf.
2010  */
2011 struct mbuf *
2012 m_defrag(struct mbuf *m0, int how)
2013 {
2014 	struct mbuf *m_new;
2015 
2016 	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
2017 		return (NULL);
2018 	if (m_new != m0)
2019 		m_freem(m0);
2020 	return (m_new);
2021 }
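
/*
 * Example (hypothetical transmit path): when a chain has too many
 * segments for a DMA engine, collapse it; on failure the original
 * chain is still intact and must be disposed of by the caller:
 *
 *	struct mbuf *n;
 *
 *	n = m_defrag(m, MB_DONTWAIT);
 *	if (n == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = n;
 */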
2022 
2023 struct mbuf *
2024 m_defrag_nofree(struct mbuf *m0, int how)
2025 {
2026 	struct mbuf	*m_new = NULL, *m_final = NULL;
2027 	int		progress = 0, length, nsize;
2028 
2029 	if (!(m0->m_flags & M_PKTHDR))
2030 		return (m0);
2031 
2032 #ifdef MBUF_STRESS_TEST
2033 	if (m_defragrandomfailures) {
2034 		int temp = karc4random() & 0xff;
2035 		if (temp == 0xba)
2036 			goto nospace;
2037 	}
2038 #endif
2039 
2040 	m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
2041 	if (m_final == NULL)
2042 		goto nospace;
2043 	m_final->m_len = 0;	/* in case m0->m_pkthdr.len is zero */
2044 
2045 	if (m_dup_pkthdr(m_final, m0, how) == 0)
2046 		goto nospace;
2047 
2048 	m_new = m_final;
2049 
2050 	while (progress < m0->m_pkthdr.len) {
2051 		length = m0->m_pkthdr.len - progress;
2052 		if (length > MCLBYTES)
2053 			length = MCLBYTES;
2054 
2055 		if (m_new == NULL) {
2056 			m_new = m_getl(length, how, MT_DATA, 0, &nsize);
2057 			if (m_new == NULL)
2058 				goto nospace;
2059 		}
2060 
2061 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
2062 		progress += length;
2063 		m_new->m_len = length;
2064 		if (m_new != m_final)
2065 			m_cat(m_final, m_new);
2066 		m_new = NULL;
2067 	}
2068 	if (m0->m_next == NULL)
2069 		m_defraguseless++;
2070 	m_defragpackets++;
2071 	m_defragbytes += m_final->m_pkthdr.len;
2072 	return (m_final);
2073 nospace:
2074 	m_defragfailure++;
2075 	if (m_new)
2076 		m_free(m_new);
2077 	m_freem(m_final);
2078 	return (NULL);
2079 }
2080 
2081 /*
2082  * Move data from uio into mbufs.
2083  */
2084 struct mbuf *
2085 m_uiomove(struct uio *uio)
2086 {
2087 	struct mbuf *m;			/* current working mbuf */
2088 	struct mbuf *head = NULL;	/* result mbuf chain */
2089 	struct mbuf **mp = &head;
2090 	int flags = M_PKTHDR;
2091 	int nsize;
2092 	int error;
2093 	int resid;
2094 
2095 	do {
2096 		if (uio->uio_resid > INT_MAX)
2097 			resid = INT_MAX;
2098 		else
2099 			resid = (int)uio->uio_resid;
2100 		m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
2101 		if (flags) {
2102 			m->m_pkthdr.len = 0;
2103 			/* Leave room for protocol headers. */
2104 			if (resid < MHLEN)
2105 				MH_ALIGN(m, resid);
2106 			flags = 0;
2107 		}
2108 		m->m_len = imin(nsize, resid);
2109 		error = uiomove(mtod(m, caddr_t), m->m_len, uio);
2110 		if (error) {
2111 			m_free(m);
2112 			goto failed;
2113 		}
2114 		*mp = m;
2115 		mp = &m->m_next;
2116 		head->m_pkthdr.len += m->m_len;
2117 	} while (uio->uio_resid > 0);
2118 
2119 	return (head);
2120 
2121 failed:
2122 	m_freem(head);
2123 	return (NULL);
2124 }
2125 
2126 struct mbuf *
2127 m_last(struct mbuf *m)
2128 {
2129 	while (m->m_next)
2130 		m = m->m_next;
2131 	return (m);
2132 }
2133 
2134 /*
2135  * Return the number of bytes in an mbuf chain.
2136  * If lastm is not NULL, also return the last mbuf.
2137  */
2138 u_int
2139 m_lengthm(struct mbuf *m, struct mbuf **lastm)
2140 {
2141 	u_int len = 0;
2142 	struct mbuf *prev = m;
2143 
2144 	while (m) {
2145 		len += m->m_len;
2146 		prev = m;
2147 		m = m->m_next;
2148 	}
2149 	if (lastm != NULL)
2150 		*lastm = prev;
2151 	return (len);
2152 }
2153 
2154 /*
2155  * Like m_lengthm(), except also keep track of mbuf usage.
2156  */
2157 u_int
2158 m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
2159 {
2160 	u_int len = 0, mbcnt = 0;
2161 	struct mbuf *prev = m;
2162 
2163 	while (m) {
2164 		len += m->m_len;
2165 		mbcnt += MSIZE;
2166 		if (m->m_flags & M_EXT)
2167 			mbcnt += m->m_ext.ext_size;
2168 		prev = m;
2169 		m = m->m_next;
2170 	}
2171 	if (lastm != NULL)
2172 		*lastm = prev;
2173 	*pmbcnt = mbcnt;
2174 	return (len);
2175 }
2176