xref: /dragonfly/sys/kern/uipc_mbuf.c (revision 299d9671)
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 1982, 1986, 1988, 1991, 1993
36  *	The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. All advertising materials mentioning features or use of this software
47  *    must display the following acknowledgement:
48  *	This product includes software developed by the University of
49  *	California, Berkeley and its contributors.
50  * 4. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
67  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
68  * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.70 2008/11/20 14:21:01 sephe Exp $
69  */
70 
71 #include "opt_param.h"
72 #include "opt_mbuf_stress_test.h"
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/kernel.h>
78 #include <sys/sysctl.h>
79 #include <sys/domain.h>
80 #include <sys/objcache.h>
81 #include <sys/tree.h>
82 #include <sys/protosw.h>
83 #include <sys/uio.h>
84 #include <sys/thread.h>
85 #include <sys/globaldata.h>
86 #include <sys/thread2.h>
87 
88 #include <machine/atomic.h>
89 
90 #include <vm/vm.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 
94 #ifdef INVARIANTS
95 #include <machine/cpu.h>
96 #endif
97 
98 /*
99  * mbuf cluster meta-data
100  */
101 struct mbcluster {
102 	int32_t	mcl_refs;
103 	void	*mcl_data;
104 };
105 
106 /*
107  * mbuf tracking for debugging purposes
108  */
109 #ifdef MBUF_DEBUG
110 
111 static MALLOC_DEFINE(M_MTRACK, "mtrack", "mtrack");
112 
113 struct mbtrack;
114 RB_HEAD(mbuf_rb_tree, mbtrack);
115 RB_PROTOTYPE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *);
116 
117 struct mbtrack {
118 	RB_ENTRY(mbtrack) rb_node;
119 	int trackid;
120 	struct mbuf *m;
121 };
122 
123 static int
124 mbtrack_cmp(struct mbtrack *mb1, struct mbtrack *mb2)
125 {
126 	if (mb1->m < mb2->m)
127 		return(-1);
128 	if (mb1->m > mb2->m)
129 		return(1);
130 	return(0);
131 }
132 
133 RB_GENERATE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *, m);
134 
135 struct mbuf_rb_tree	mbuf_track_root;
136 
137 static void
138 mbuftrack(struct mbuf *m)
139 {
140 	struct mbtrack *mbt;
141 
142 	crit_enter();
143 	mbt = kmalloc(sizeof(*mbt), M_MTRACK, M_INTWAIT|M_ZERO);
144 	mbt->m = m;
145 	if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt))
146 		panic("mbuftrack: mbuf %p already being tracked\n", m);
147 	crit_exit();
148 }
149 
150 static void
151 mbufuntrack(struct mbuf *m)
152 {
153 	struct mbtrack *mbt;
154 
155 	crit_enter();
156 	mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
157 	if (mbt == NULL) {
158 		kprintf("mbufuntrack: mbuf %p was not tracked\n", m);
159 	} else {
160 		mbuf_rb_tree_RB_REMOVE(&mbuf_track_root, mbt);
161 		kfree(mbt, M_MTRACK);
162 	}
163 	crit_exit();
164 }
165 
166 void
167 mbuftrackid(struct mbuf *m, int trackid)
168 {
169 	struct mbtrack *mbt;
170 	struct mbuf *n;
171 
172 	crit_enter();
173 	while (m) {
174 		n = m->m_nextpkt;
175 		while (m) {
176 			mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
177 			if (mbt)
178 				mbt->trackid = trackid;
179 			m = m->m_next;
180 		}
181 		m = n;
182 	}
183 	crit_exit();
184 }
185 
186 static int
187 mbuftrack_callback(struct mbtrack *mbt, void *arg)
188 {
189 	struct sysctl_req *req = arg;
190 	char buf[64];
191 	int error;
192 
193 	ksnprintf(buf, sizeof(buf), "mbuf %p track %d\n", mbt->m, mbt->trackid);
194 
195 	error = SYSCTL_OUT(req, buf, strlen(buf));
196 	if (error)
197 		return(-error);
198 	return(0);
199 }
200 
201 static int
202 mbuftrack_show(SYSCTL_HANDLER_ARGS)
203 {
204 	int error;
205 
206 	crit_enter();
207 	error = mbuf_rb_tree_RB_SCAN(&mbuf_track_root, NULL,
208 				     mbuftrack_callback, req);
209 	crit_exit();
210 	return (-error);
211 }
212 SYSCTL_PROC(_kern_ipc, OID_AUTO, showmbufs, CTLFLAG_RD|CTLTYPE_STRING,
213 	    0, 0, mbuftrack_show, "A", "Show all in-use mbufs");
214 
215 #else
216 
217 #define mbuftrack(m)
218 #define mbufuntrack(m)
219 
220 #endif
221 
222 static void mbinit(void *);
223 SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL)
224 
225 static u_long	mbtypes[SMP_MAXCPU][MT_NTYPES];
226 
227 static struct mbstat mbstat[SMP_MAXCPU];
228 int	max_linkhdr;
229 int	max_protohdr;
230 int	max_hdr;
231 int	max_datalen;
232 int	m_defragpackets;
233 int	m_defragbytes;
234 int	m_defraguseless;
235 int	m_defragfailure;
236 #ifdef MBUF_STRESS_TEST
237 int	m_defragrandomfailures;
238 #endif
239 
240 struct objcache *mbuf_cache, *mbufphdr_cache;
241 struct objcache *mclmeta_cache;
242 struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;
243 
244 int	nmbclusters;
245 int	nmbufs;
246 
247 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
248 	   &max_linkhdr, 0, "");
249 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
250 	   &max_protohdr, 0, "");
251 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
252 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
253 	   &max_datalen, 0, "");
254 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
255 	   &mbuf_wait, 0, "");
256 static int do_mbstat(SYSCTL_HANDLER_ARGS);
257 
258 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT|CTLFLAG_RD,
259 	0, 0, do_mbstat, "S,mbstat", "");
260 
261 static int do_mbtypes(SYSCTL_HANDLER_ARGS);
262 
263 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtypes, CTLTYPE_ULONG|CTLFLAG_RD,
264 	0, 0, do_mbtypes, "LU", "");
265 
266 static int
267 do_mbstat(SYSCTL_HANDLER_ARGS)
268 {
269 	struct mbstat mbstat_total;
270 	struct mbstat *mbstat_totalp;
271 	int i;
272 
273 	bzero(&mbstat_total, sizeof(mbstat_total));
274 	mbstat_totalp = &mbstat_total;
275 
276 	for (i = 0; i < ncpus; i++)
277 	{
278 		mbstat_total.m_mbufs += mbstat[i].m_mbufs;
279 		mbstat_total.m_clusters += mbstat[i].m_clusters;
280 		mbstat_total.m_spare += mbstat[i].m_spare;
281 		mbstat_total.m_clfree += mbstat[i].m_clfree;
282 		mbstat_total.m_drops += mbstat[i].m_drops;
283 		mbstat_total.m_wait += mbstat[i].m_wait;
284 		mbstat_total.m_drain += mbstat[i].m_drain;
285 		mbstat_total.m_mcfail += mbstat[i].m_mcfail;
286 		mbstat_total.m_mpfail += mbstat[i].m_mpfail;
287 
288 	}
289 	/*
290 	 * The following fields are not cumulative, so just fetch
291 	 * their values once.
292 	 */
293 	mbstat_total.m_msize = mbstat[0].m_msize;
294 	mbstat_total.m_mclbytes = mbstat[0].m_mclbytes;
295 	mbstat_total.m_minclsize = mbstat[0].m_minclsize;
296 	mbstat_total.m_mlen = mbstat[0].m_mlen;
297 	mbstat_total.m_mhlen = mbstat[0].m_mhlen;
298 
299 	return(sysctl_handle_opaque(oidp, mbstat_totalp, sizeof(mbstat_total), req));
300 }
301 
302 static int
303 do_mbtypes(SYSCTL_HANDLER_ARGS)
304 {
305 	u_long totals[MT_NTYPES];
306 	int i, j;
307 
308 	for (i = 0; i < MT_NTYPES; i++)
309 		totals[i] = 0;
310 
311 	for (i = 0; i < ncpus; i++)
312 	{
313 		for (j = 0; j < MT_NTYPES; j++)
314 			totals[j] += mbtypes[i][j];
315 	}
316 
317 	return(sysctl_handle_opaque(oidp, totals, sizeof(totals), req));
318 }
319 
320 /*
321  * These are read-only because we do not currently have any code
322  * to adjust the objcache limits after the fact.  The variables
323  * may only be set as boot-time tunables.
324  */
325 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
326 	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
327 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
328 	   "Maximum number of mbufs available");
329 
330 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
331 	   &m_defragpackets, 0, "");
332 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
333 	   &m_defragbytes, 0, "");
334 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
335 	   &m_defraguseless, 0, "");
336 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
337 	   &m_defragfailure, 0, "");
338 #ifdef MBUF_STRESS_TEST
339 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
340 	   &m_defragrandomfailures, 0, "");
341 #endif
342 
343 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
344 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
345 static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");
346 
347 static void m_reclaim (void);
348 static void m_mclref(void *arg);
349 static void m_mclfree(void *arg);
350 
351 #ifndef NMBCLUSTERS
352 #define NMBCLUSTERS	(512 + maxusers * 16)
353 #endif
354 #ifndef NMBUFS
355 #define NMBUFS		(nmbclusters * 2)
356 #endif
357 
358 /*
359  * Perform sanity checks of tunables declared above.
360  */
361 static void
362 tunable_mbinit(void *dummy)
363 {
364 	/*
365 	 * This has to be done before VM init.
366 	 */
367 	nmbclusters = NMBCLUSTERS;
368 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
369 	nmbufs = NMBUFS;
370 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
371 	/* Sanity checks */
372 	if (nmbufs < nmbclusters * 2)
373 		nmbufs = nmbclusters * 2;
374 }
375 SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
376 	tunable_mbinit, NULL);
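/*
 * Illustrative sketch: the two tunables fetched above are normally set
 * from the boot loader, for example in /boot/loader.conf (the values
 * shown are arbitrary examples; the sanity check above still forces
 * nmbufs to at least twice nmbclusters):
 *
 *	kern.ipc.nmbclusters="65536"
 *	kern.ipc.nmbufs="131072"
 */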
377 
378 /* "number of clusters of pages" */
379 #define NCL_INIT	1
380 
381 #define NMB_INIT	16
382 
383 /*
384  * The mbuf object cache only guarantees that m_next and m_nextpkt are
385  * NULL and that m_data points to the beginning of the data area.  In
386  * particular, m_len and m_pkthdr.len are uninitialized.  It is the
387  * responsibility of the caller to initialize those fields before use.
388  */
389 
390 static boolean_t __inline
391 mbuf_ctor(void *obj, void *private, int ocflags)
392 {
393 	struct mbuf *m = obj;
394 
395 	m->m_next = NULL;
396 	m->m_nextpkt = NULL;
397 	m->m_data = m->m_dat;
398 	m->m_flags = 0;
399 
400 	return (TRUE);
401 }
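/*
 * Illustrative sketch of the caller-side contract described above: after
 * an mbuf comes out of the object cache, m_len (and m_pkthdr.len for a
 * packet header) must still be initialized before use, e.g.:
 *
 *	struct mbuf *m;
 *
 *	m = m_gethdr(MB_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = 0;
 *	m->m_pkthdr.len = 0;
 */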
402 
403 /*
404  * Initialize the mbuf and the packet header fields.
405  */
406 static boolean_t
407 mbufphdr_ctor(void *obj, void *private, int ocflags)
408 {
409 	struct mbuf *m = obj;
410 
411 	m->m_next = NULL;
412 	m->m_nextpkt = NULL;
413 	m->m_data = m->m_pktdat;
414 	m->m_flags = M_PKTHDR | M_PHCACHE;
415 
416 	m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
417 	SLIST_INIT(&m->m_pkthdr.tags);
418 	m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
419 	m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
420 
421 	return (TRUE);
422 }
423 
424 /*
425  * An mbcluster object consists of a 2K (MCLBYTES) cluster and a refcount.
426  */
427 static boolean_t
428 mclmeta_ctor(void *obj, void *private, int ocflags)
429 {
430 	struct mbcluster *cl = obj;
431 	void *buf;
432 
433 	if (ocflags & M_NOWAIT)
434 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
435 	else
436 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
437 	if (buf == NULL)
438 		return (FALSE);
439 	cl->mcl_refs = 0;
440 	cl->mcl_data = buf;
441 	return (TRUE);
442 }
443 
444 static void
445 mclmeta_dtor(void *obj, void *private)
446 {
447 	struct mbcluster *mcl = obj;
448 
449 	KKASSERT(mcl->mcl_refs == 0);
450 	kfree(mcl->mcl_data, M_MBUFCL);
451 }
452 
453 static void
454 linkcluster(struct mbuf *m, struct mbcluster *cl)
455 {
456 	/*
457 	 * Add the cluster to the mbuf.  The caller will detect that the
458 	 * mbuf now has an attached cluster.
459 	 */
460 	m->m_ext.ext_arg = cl;
461 	m->m_ext.ext_buf = cl->mcl_data;
462 	m->m_ext.ext_ref = m_mclref;
463 	m->m_ext.ext_free = m_mclfree;
464 	m->m_ext.ext_size = MCLBYTES;
465 	atomic_add_int(&cl->mcl_refs, 1);
466 
467 	m->m_data = m->m_ext.ext_buf;
468 	m->m_flags |= M_EXT | M_EXT_CLUSTER;
469 }
470 
471 static boolean_t
472 mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
473 {
474 	struct mbuf *m = obj;
475 	struct mbcluster *cl;
476 
477 	mbufphdr_ctor(obj, private, ocflags);
478 	cl = objcache_get(mclmeta_cache, ocflags);
479 	if (cl == NULL)
480 		return (FALSE);
481 	m->m_flags |= M_CLCACHE;
482 	linkcluster(m, cl);
483 	return (TRUE);
484 }
485 
486 static boolean_t
487 mbufcluster_ctor(void *obj, void *private, int ocflags)
488 {
489 	struct mbuf *m = obj;
490 	struct mbcluster *cl;
491 
492 	mbuf_ctor(obj, private, ocflags);
493 	cl = objcache_get(mclmeta_cache, ocflags);
494 	if (cl == NULL)
495 		return (FALSE);
496 	m->m_flags |= M_CLCACHE;
497 	linkcluster(m, cl);
498 	return (TRUE);
499 }
500 
501 /*
502  * Used for both the cluster and cluster PHDR caches.
503  *
504  * The mbuf may have lost its cluster due to sharing; deal
505  * with that situation by checking M_EXT.
506  */
507 static void
508 mbufcluster_dtor(void *obj, void *private)
509 {
510 	struct mbuf *m = obj;
511 	struct mbcluster *mcl;
512 
513 	if (m->m_flags & M_EXT) {
514 		KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
515 		mcl = m->m_ext.ext_arg;
516 		KKASSERT(mcl->mcl_refs == 1);
517 		mcl->mcl_refs = 0;
518 		objcache_put(mclmeta_cache, mcl);
519 	}
520 }
521 
522 struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
523 struct objcache_malloc_args mclmeta_malloc_args =
524 	{ sizeof(struct mbcluster), M_MCLMETA };
525 
526 /* ARGSUSED*/
527 static void
528 mbinit(void *dummy)
529 {
530 	int mb_limit, cl_limit, mbcl_limit;
531 	int limit;
532 	int i;
533 
534 	/*
535 	 * Initialize statistics
536 	 */
537 	for (i = 0; i < ncpus; i++) {
538 		atomic_set_long_nonlocked(&mbstat[i].m_msize, MSIZE);
539 		atomic_set_long_nonlocked(&mbstat[i].m_mclbytes, MCLBYTES);
540 		atomic_set_long_nonlocked(&mbstat[i].m_minclsize, MINCLSIZE);
541 		atomic_set_long_nonlocked(&mbstat[i].m_mlen, MLEN);
542 		atomic_set_long_nonlocked(&mbstat[i].m_mhlen, MHLEN);
543 	}
544 
545 	/*
546 	 * Create object caches and save cluster limits, which will
547 	 * be used to adjust the backing kmalloc pools' limits later.
548 	 */
549 
550 	mb_limit = cl_limit = mbcl_limit = 0;
551 
552 	limit = nmbufs;
553 	mbuf_cache = objcache_create("mbuf", &limit, 0,
554 	    mbuf_ctor, NULL, NULL,
555 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
556 	if (limit > mb_limit)
557 		mb_limit = limit;
558 
559 	limit = nmbufs;
560 	mbufphdr_cache = objcache_create("mbuf pkt hdr", &limit, 64,
561 	    mbufphdr_ctor, NULL, NULL,
562 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
563 	if (limit > mb_limit)
564 		mb_limit = limit;
565 
566 	cl_limit = nmbclusters;
567 	mclmeta_cache = objcache_create("cluster mbuf", &cl_limit, 0,
568 	    mclmeta_ctor, mclmeta_dtor, NULL,
569 	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
570 
571 	limit = nmbclusters;
572 	mbufcluster_cache = objcache_create("mbuf + cluster", &limit, 0,
573 	    mbufcluster_ctor, mbufcluster_dtor, NULL,
574 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
575 	if (limit > mbcl_limit)
576 		mbcl_limit = limit;
577 
578 	limit = nmbclusters;
579 	mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
580 	    &limit, 64, mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
581 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
582 	if (limit > mbcl_limit)
583 		mbcl_limit = limit;
584 
585 	/*
586 	 * Adjust backing kmalloc pools' limit
587 	 *
588 	 * NOTE: We raise the limit by another 1/8 to take the effect
589 	 * of loosememuse into account.
590 	 */
591 	cl_limit += cl_limit / 8;
592 	kmalloc_raise_limit(mclmeta_malloc_args.mtype,
593 			    mclmeta_malloc_args.objsize * cl_limit);
594 	kmalloc_raise_limit(M_MBUFCL, MCLBYTES * cl_limit);
595 
596 	mb_limit += mbcl_limit;
597 	mb_limit += mb_limit / 4; /* save some space for non-pkthdr mbufs */
598 	mb_limit += mb_limit / 8;
599 	kmalloc_raise_limit(mbuf_malloc_args.mtype,
600 			    mbuf_malloc_args.objsize * mb_limit);
601 }
602 
603 /*
604  * Return the number of references to this mbuf's data.  0 is returned
605  * if the mbuf is not M_EXT, a reference count is returned if it is
606  * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
607  */
608 int
609 m_sharecount(struct mbuf *m)
610 {
611 	switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
612 	case 0:
613 		return (0);
614 	case M_EXT:
615 		return (99);
616 	case M_EXT | M_EXT_CLUSTER:
617 		return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
618 	}
619 	/* NOTREACHED */
620 	return (0);		/* to shut up compiler */
621 }
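/*
 * Illustrative sketch: a caller that wants to modify external data in
 * place can use the share count to detect sharing.  Anything other than
 * a count of 1 (including the special value 99) means the data may be
 * referenced elsewhere and must not be written in place.  Assuming m is
 * a packet header chain, one option is to take a private copy first:
 *
 *	if ((m->m_flags & M_EXT) && m_sharecount(m) != 1) {
 *		struct mbuf *n = m_dup(m, MB_DONTWAIT);
 *
 *		if (n == NULL)
 *			return (ENOBUFS);
 *		m_freem(m);
 *		m = n;
 *	}
 */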
622 
623 /*
624  * change mbuf to new type
625  */
626 void
627 m_chtype(struct mbuf *m, int type)
628 {
629 	struct globaldata *gd = mycpu;
630 
631 	atomic_add_long_nonlocked(&mbtypes[gd->gd_cpuid][type], 1);
632 	atomic_subtract_long_nonlocked(&mbtypes[gd->gd_cpuid][m->m_type], 1);
633 	atomic_set_short_nonlocked(&m->m_type, type);
634 }
635 
636 static void
637 m_reclaim(void)
638 {
639 	struct domain *dp;
640 	struct protosw *pr;
641 
642 	crit_enter();
643 	SLIST_FOREACH(dp, &domains, dom_next) {
644 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
645 			if (pr->pr_drain)
646 				(*pr->pr_drain)();
647 		}
648 	}
649 	crit_exit();
650 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_drain, 1);
651 }
652 
653 static void __inline
654 updatestats(struct mbuf *m, int type)
655 {
656 	struct globaldata *gd = mycpu;
657 	m->m_type = type;
658 
659 	mbuftrack(m);
660 
661 	atomic_add_long_nonlocked(&mbtypes[gd->gd_cpuid][type], 1);
662 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mbufs, 1);
663 
664 }
665 
666 /*
667  * Allocate an mbuf.
668  */
669 struct mbuf *
670 m_get(int how, int type)
671 {
672 	struct mbuf *m;
673 	int ntries = 0;
674 	int ocf = MBTOM(how);
675 
676 retryonce:
677 
678 	m = objcache_get(mbuf_cache, ocf);
679 
680 	if (m == NULL) {
681 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
682 			struct objcache *reclaimlist[] = {
683 				mbufphdr_cache,
684 				mbufcluster_cache, mbufphdrcluster_cache
685 			};
686 			const int nreclaims = __arysize(reclaimlist);
687 
688 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
689 				m_reclaim();
690 			goto retryonce;
691 		}
692 		return (NULL);
693 	}
694 
695 	updatestats(m, type);
696 	return (m);
697 }
698 
699 struct mbuf *
700 m_gethdr(int how, int type)
701 {
702 	struct mbuf *m;
703 	int ocf = MBTOM(how);
704 	int ntries = 0;
705 
706 retryonce:
707 
708 	m = objcache_get(mbufphdr_cache, ocf);
709 
710 	if (m == NULL) {
711 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
712 			struct objcache *reclaimlist[] = {
713 				mbuf_cache,
714 				mbufcluster_cache, mbufphdrcluster_cache
715 			};
716 			const int nreclaims = __arysize(reclaimlist);
717 
718 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
719 				m_reclaim();
720 			goto retryonce;
721 		}
722 		return (NULL);
723 	}
724 
725 	updatestats(m, type);
726 	return (m);
727 }
728 
729 /*
730  * Get an mbuf (not an mbuf cluster!) and zero it.
731  * Deprecated.
732  */
733 struct mbuf *
734 m_getclr(int how, int type)
735 {
736 	struct mbuf *m;
737 
738 	m = m_get(how, type);
739 	if (m != NULL)
740 		bzero(m->m_data, MLEN);
741 	return (m);
742 }
743 
744 /*
745  * Returns an mbuf with an attached cluster.
746  * Because many network drivers use this kind of buffer heavily, it is
747  * convenient to keep a small pool of free buffers of this kind.
748  * Even a small size such as 10 gives about 10% improvement in the
749  * forwarding rate in a bridge or router.
750  */
751 struct mbuf *
752 m_getcl(int how, short type, int flags)
753 {
754 	struct mbuf *m;
755 	int ocflags = MBTOM(how);
756 	int ntries = 0;
757 
758 retryonce:
759 
760 	if (flags & M_PKTHDR)
761 		m = objcache_get(mbufphdrcluster_cache, ocflags);
762 	else
763 		m = objcache_get(mbufcluster_cache, ocflags);
764 
765 	if (m == NULL) {
766 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
767 			struct objcache *reclaimlist[1];
768 
769 			if (flags & M_PKTHDR)
770 				reclaimlist[0] = mbufcluster_cache;
771 			else
772 				reclaimlist[0] = mbufphdrcluster_cache;
773 			if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
774 				m_reclaim();
775 			goto retryonce;
776 		}
777 		return (NULL);
778 	}
779 
780 	m->m_type = type;
781 
782 	mbuftrack(m);
783 
784 	atomic_add_long_nonlocked(&mbtypes[mycpu->gd_cpuid][type], 1);
785 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1);
786 	return (m);
787 }
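/*
 * Illustrative sketch of the driver receive-path usage mentioned above:
 * allocate a packet-header mbuf with a cluster already attached and size
 * it for an incoming frame before handing it to the hardware.
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = MCLBYTES;
 */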
788 
789 /*
790  * Allocate chain of requested length.
791  */
792 struct mbuf *
793 m_getc(int len, int how, int type)
794 {
795 	struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
796 	int nsize;
797 
798 	while (len > 0) {
799 		n = m_getl(len, how, type, 0, &nsize);
800 		if (n == NULL)
801 			goto failed;
802 		n->m_len = 0;
803 		*ntail = n;
804 		ntail = &n->m_next;
805 		len -= nsize;
806 	}
807 	return (nfirst);
808 
809 failed:
810 	m_freem(nfirst);
811 	return (NULL);
812 }
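/*
 * Illustrative sketch: allocate roughly "resid" bytes worth of buffers up
 * front (resid is just a placeholder for the caller's byte count); each
 * mbuf in the returned chain comes back with m_len set to 0 and must be
 * filled and sized by the caller.
 *
 *	struct mbuf *n;
 *
 *	n = m_getc(resid, MB_WAIT, MT_DATA);
 *	if (n == NULL)
 *		return (ENOBUFS);
 */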
813 
814 /*
815  * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
816  * and return a pointer to the head of the allocated chain. If m0 is
817  * non-null, then we assume that it is a single mbuf or an mbuf chain to
818  * which we want len bytes worth of mbufs and/or clusters attached, and so
819  * if we succeed in allocating it, we will just return a pointer to m0.
820  *
821  * If we happen to fail at any point during the allocation, we will free
822  * up everything we have already allocated and return NULL.
823  *
824  * Deprecated.  Use m_getc() and m_cat() instead.
825  */
826 struct mbuf *
827 m_getm(struct mbuf *m0, int len, int type, int how)
828 {
829 	struct mbuf *nfirst;
830 
831 	nfirst = m_getc(len, how, type);
832 
833 	if (m0 != NULL) {
834 		m_last(m0)->m_next = nfirst;
835 		return (m0);
836 	}
837 
838 	return (nfirst);
839 }
840 
841 /*
842  * Adds a cluster to a normal mbuf; M_EXT is set on success.
843  * Deprecated.  Use m_getcl() instead.
844  */
845 void
846 m_mclget(struct mbuf *m, int how)
847 {
848 	struct mbcluster *mcl;
849 
850 	KKASSERT((m->m_flags & M_EXT) == 0);
851 	mcl = objcache_get(mclmeta_cache, MBTOM(how));
852 	if (mcl != NULL) {
853 		linkcluster(m, mcl);
854 		atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1);
855 	}
856 }
857 
858 /*
859  * Updates to mbcluster must be MPSAFE.  Only an entity which already has
860  * a reference to the cluster can ref it, so we are in no danger of
861  * racing an add with a subtract.  But the operation must still be atomic
862  * since multiple entities may have a reference on the cluster.
863  *
864  * m_mclfree() is almost the same but it must contend with two entities
865  * freeing the cluster at the same time.  If there is only one reference
866  * count we are the only entity referencing the cluster and no further
867  * locking is required.  Otherwise we must protect against a race to 0
868  * with the serializer.
869  */
870 static void
871 m_mclref(void *arg)
872 {
873 	struct mbcluster *mcl = arg;
874 
875 	atomic_add_int(&mcl->mcl_refs, 1);
876 }
877 
878 /*
879  * When dereferencing a cluster we have to deal with a N->0 race, where
880  * N entities free their references simultaneously.  To do this we use
881  * atomic_fetchadd_int().
882  */
883 static void
884 m_mclfree(void *arg)
885 {
886 	struct mbcluster *mcl = arg;
887 
888 	if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1)
889 		objcache_put(mclmeta_cache, mcl);
890 }
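/*
 * Worked example of the N->0 race handling above: with mcl_refs == 2 and
 * two CPUs calling m_mclfree() concurrently, atomic_fetchadd_int() returns
 * the value prior to the subtraction, so one caller sees 2 and simply
 * drops its reference while the other sees 1 and returns the cluster to
 * the cache.  Exactly one of them frees, with no extra locking.
 */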
891 
892 /*
893  * Free a single mbuf and any associated external storage.  The successor,
894  * if any, is returned.
895  *
896  * We do need to check non-first mbufs for m_aux, since some existing
897  * code does not call M_PREPEND properly.
898  * (example: call to bpf_mtap from drivers)
899  */
900 struct mbuf *
901 m_free(struct mbuf *m)
902 {
903 	struct mbuf *n;
904 	struct globaldata *gd = mycpu;
905 
906 	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
907 	atomic_subtract_long_nonlocked(&mbtypes[gd->gd_cpuid][m->m_type], 1);
908 
909 	n = m->m_next;
910 
911 	/*
912 	 * Make sure the mbuf is in constructed state before returning it
913 	 * to the objcache.
914 	 */
915 	m->m_next = NULL;
916 	mbufuntrack(m);
917 #ifdef notyet
918 	KKASSERT(m->m_nextpkt == NULL);
919 #else
920 	if (m->m_nextpkt != NULL) {
921 		static int afewtimes = 10;
922 
923 		if (afewtimes-- > 0) {
924 			kprintf("mfree: m->m_nextpkt != NULL\n");
925 			print_backtrace();
926 		}
927 		m->m_nextpkt = NULL;
928 	}
929 #endif
930 	if (m->m_flags & M_PKTHDR) {
931 		m_tag_delete_chain(m);		/* eliminate XXX JH */
932 	}
933 
934 	m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
935 
936 	/*
937 	 * Clean the M_PKTHDR state so we can return the mbuf to its original
938 	 * cache.  This is based on the PHCACHE flag which tells us whether
939 	 * the mbuf was originally allocated out of a packet-header cache
940 	 * or a non-packet-header cache.
941 	 */
942 	if (m->m_flags & M_PHCACHE) {
943 		m->m_flags |= M_PKTHDR;
944 		m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
945 		m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
946 		m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
947 		SLIST_INIT(&m->m_pkthdr.tags);
948 	}
949 
950 	/*
951 	 * Handle remaining flags combinations.  M_CLCACHE tells us whether
952 	 * the mbuf was originally allocated from a cluster cache or not,
953 	 * and is totally separate from whether the mbuf is currently
954 	 * associated with a cluster.
955 	 */
956 	crit_enter();
957 	switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
958 	case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
959 		/*
960 		 * mbuf+cluster cache case.  The mbuf was allocated from the
961 		 * combined mbuf_cluster cache and can be returned to the
962 		 * cache if the cluster hasn't been shared.
963 		 */
964 		if (m_sharecount(m) == 1) {
965 			/*
966 			 * The cluster has not been shared, we can just
967 			 * reset the data pointer and return the mbuf
968 			 * to the cluster cache.  Note that the reference
969 			 * count is left intact (it is still associated with
970 			 * an mbuf).
971 			 */
972 			m->m_data = m->m_ext.ext_buf;
973 			if (m->m_flags & M_PHCACHE)
974 				objcache_put(mbufphdrcluster_cache, m);
975 			else
976 				objcache_put(mbufcluster_cache, m);
977 			atomic_subtract_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1);
978 		} else {
979 			/*
980 			 * Hell.  Someone else has a ref on this cluster,
981 			 * we have to disconnect it which means we can't
982 			 * put it back into the mbufcluster_cache, we
983 			 * have to destroy the mbuf.
984 			 *
985 			 * Other mbuf references to the cluster will typically
986 			 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
987 			 *
988 			 * XXX we could try to connect another cluster to
989 			 * it.
990 			 */
991 			m->m_ext.ext_free(m->m_ext.ext_arg);
992 			m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
993 			if (m->m_flags & M_PHCACHE)
994 				objcache_dtor(mbufphdrcluster_cache, m);
995 			else
996 				objcache_dtor(mbufcluster_cache, m);
997 		}
998 		break;
999 	case M_EXT | M_EXT_CLUSTER:
1000 		/*
1001 		 * Normal cluster associated with an mbuf that was allocated
1002 		 * from the normal mbuf pool rather than the cluster pool.
1003 		 * The cluster has to be independently disassociated from the
1004 		 * mbuf.
1005 		 */
1006 		if (m_sharecount(m) == 1)
1007 			atomic_subtract_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1);
1008 		/* fall through */
1009 	case M_EXT:
1010 		/*
1011 		 * Normal cluster association case, disconnect the cluster from
1012 		 * the mbuf.  The cluster may or may not be custom.
1013 		 */
1014 		m->m_ext.ext_free(m->m_ext.ext_arg);
1015 		m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1016 		/* fall through */
1017 	case 0:
1018 		/*
1019 		 * return the mbuf to the mbuf cache.
1020 		 */
1021 		if (m->m_flags & M_PHCACHE) {
1022 			m->m_data = m->m_pktdat;
1023 			objcache_put(mbufphdr_cache, m);
1024 		} else {
1025 			m->m_data = m->m_dat;
1026 			objcache_put(mbuf_cache, m);
1027 		}
1028 		atomic_subtract_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mbufs, 1);
1029 		break;
1030 	default:
1031 		if (!panicstr)
1032 			panic("bad mbuf flags %p %08x\n", m, m->m_flags);
1033 		break;
1034 	}
1035 	crit_exit();
1036 	return (n);
1037 }
1038 
1039 void
1040 m_freem(struct mbuf *m)
1041 {
1042 	crit_enter();
1043 	while (m)
1044 		m = m_free(m);
1045 	crit_exit();
1046 }
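/*
 * Illustrative sketch: because m_free() returns the successor, a consumer
 * can release a chain one mbuf at a time while walking it; m_freem() is
 * essentially this loop wrapped in a critical section.
 *
 *	while (m != NULL)
 *		m = m_free(m);
 */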
1047 
1048 /*
1049  * mbuf utility routines
1050  */
1051 
1052 /*
1053  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
1054  * copy junk along.
1055  */
1056 struct mbuf *
1057 m_prepend(struct mbuf *m, int len, int how)
1058 {
1059 	struct mbuf *mn;
1060 
1061 	if (m->m_flags & M_PKTHDR)
1062 	    mn = m_gethdr(how, m->m_type);
1063 	else
1064 	    mn = m_get(how, m->m_type);
1065 	if (mn == NULL) {
1066 		m_freem(m);
1067 		return (NULL);
1068 	}
1069 	if (m->m_flags & M_PKTHDR)
1070 		M_MOVE_PKTHDR(mn, m);
1071 	mn->m_next = m;
1072 	m = mn;
1073 	if (len < MHLEN)
1074 		MH_ALIGN(m, len);
1075 	m->m_len = len;
1076 	return (m);
1077 }
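/*
 * Illustrative sketch: callers normally go through the M_PREPEND() macro,
 * which falls back to m_prepend() only when the existing mbuf has no
 * leading space.  Here sizeof(struct ether_header) merely stands in for
 * whatever header the caller is adding.
 *
 *	M_PREPEND(m, sizeof(struct ether_header), MB_DONTWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);
 */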
1078 
1079 /*
1080  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1081  * continuing for "len" bytes.  If len is M_COPYALL, copy to the end of the chain.
1082  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
1083  * Note that the copy is read-only, because clusters are not copied,
1084  * only their reference counts are incremented.
1085  */
1086 struct mbuf *
1087 m_copym(const struct mbuf *m, int off0, int len, int wait)
1088 {
1089 	struct mbuf *n, **np;
1090 	int off = off0;
1091 	struct mbuf *top;
1092 	int copyhdr = 0;
1093 
1094 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
1095 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
1096 	if (off == 0 && m->m_flags & M_PKTHDR)
1097 		copyhdr = 1;
1098 	while (off > 0) {
1099 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1100 		if (off < m->m_len)
1101 			break;
1102 		off -= m->m_len;
1103 		m = m->m_next;
1104 	}
1105 	np = &top;
1106 	top = 0;
1107 	while (len > 0) {
1108 		if (m == NULL) {
1109 			KASSERT(len == M_COPYALL,
1110 			    ("m_copym, length > size of mbuf chain"));
1111 			break;
1112 		}
1113 		/*
1114 		 * Because we are sharing any cluster attachment below,
1115 		 * be sure to get an mbuf that does not have a cluster
1116 		 * associated with it.
1117 		 */
1118 		if (copyhdr)
1119 			n = m_gethdr(wait, m->m_type);
1120 		else
1121 			n = m_get(wait, m->m_type);
1122 		*np = n;
1123 		if (n == NULL)
1124 			goto nospace;
1125 		if (copyhdr) {
1126 			if (!m_dup_pkthdr(n, m, wait))
1127 				goto nospace;
1128 			if (len == M_COPYALL)
1129 				n->m_pkthdr.len -= off0;
1130 			else
1131 				n->m_pkthdr.len = len;
1132 			copyhdr = 0;
1133 		}
1134 		n->m_len = min(len, m->m_len - off);
1135 		if (m->m_flags & M_EXT) {
1136 			KKASSERT((n->m_flags & M_EXT) == 0);
1137 			n->m_data = m->m_data + off;
1138 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1139 			n->m_ext = m->m_ext;
1140 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1141 		} else {
1142 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1143 			    (unsigned)n->m_len);
1144 		}
1145 		if (len != M_COPYALL)
1146 			len -= n->m_len;
1147 		off = 0;
1148 		m = m->m_next;
1149 		np = &n->m_next;
1150 	}
1151 	if (top == NULL)
1152 		atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1153 	return (top);
1154 nospace:
1155 	m_freem(top);
1156 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1157 	return (NULL);
1158 }
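/*
 * Illustrative sketch: a protocol that must retain the original data (for
 * example for retransmission) transmits a reference-counted copy instead
 * of the original chain.
 *
 *	struct mbuf *n;
 *
 *	n = m_copym(m, 0, M_COPYALL, MB_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 *
 * The copy shares any clusters with the original, so it must be treated
 * as read-only.
 */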
1159 
1160 /*
1161  * Copy an entire packet, including header (which must be present).
1162  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1163  * Note that the copy is read-only, because clusters are not copied,
1164  * only their reference counts are incremented.
1165  * Preserve alignment of the first mbuf so if the creator has left
1166  * some room at the beginning (e.g. for inserting protocol headers)
1167  * the copies also have the room available.
1168  */
1169 struct mbuf *
1170 m_copypacket(struct mbuf *m, int how)
1171 {
1172 	struct mbuf *top, *n, *o;
1173 
1174 	n = m_gethdr(how, m->m_type);
1175 	top = n;
1176 	if (!n)
1177 		goto nospace;
1178 
1179 	if (!m_dup_pkthdr(n, m, how))
1180 		goto nospace;
1181 	n->m_len = m->m_len;
1182 	if (m->m_flags & M_EXT) {
1183 		KKASSERT((n->m_flags & M_EXT) == 0);
1184 		n->m_data = m->m_data;
1185 		m->m_ext.ext_ref(m->m_ext.ext_arg);
1186 		n->m_ext = m->m_ext;
1187 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1188 	} else {
1189 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
1190 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1191 	}
1192 
1193 	m = m->m_next;
1194 	while (m) {
1195 		o = m_get(how, m->m_type);
1196 		if (!o)
1197 			goto nospace;
1198 
1199 		n->m_next = o;
1200 		n = n->m_next;
1201 
1202 		n->m_len = m->m_len;
1203 		if (m->m_flags & M_EXT) {
1204 			KKASSERT((n->m_flags & M_EXT) == 0);
1205 			n->m_data = m->m_data;
1206 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1207 			n->m_ext = m->m_ext;
1208 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1209 		} else {
1210 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1211 		}
1212 
1213 		m = m->m_next;
1214 	}
1215 	return top;
1216 nospace:
1217 	m_freem(top);
1218 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1219 	return (NULL);
1220 }
1221 
1222 /*
1223  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1224  * continuing for "len" bytes, into the indicated buffer.
1225  */
1226 void
1227 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1228 {
1229 	unsigned count;
1230 
1231 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1232 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1233 	while (off > 0) {
1234 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1235 		if (off < m->m_len)
1236 			break;
1237 		off -= m->m_len;
1238 		m = m->m_next;
1239 	}
1240 	while (len > 0) {
1241 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1242 		count = min(m->m_len - off, len);
1243 		bcopy(mtod(m, caddr_t) + off, cp, count);
1244 		len -= count;
1245 		cp += count;
1246 		off = 0;
1247 		m = m->m_next;
1248 	}
1249 }
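/*
 * Illustrative sketch: copy a fixed-size header out of a possibly
 * fragmented chain into local storage without modifying the chain
 * (struct ip is just an example header type here).
 *
 *	struct ip ip;
 *
 *	m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
 */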
1250 
1251 /*
1252  * Copy a packet header mbuf chain into a completely new chain, including
1253  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1254  * you need a writable copy of an mbuf chain.
1255  */
1256 struct mbuf *
1257 m_dup(struct mbuf *m, int how)
1258 {
1259 	struct mbuf **p, *top = NULL;
1260 	int remain, moff, nsize;
1261 
1262 	/* Sanity check */
1263 	if (m == NULL)
1264 		return (NULL);
1265 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1266 
1267 	/* While there's more data, get a new mbuf, tack it on, and fill it */
1268 	remain = m->m_pkthdr.len;
1269 	moff = 0;
1270 	p = &top;
1271 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
1272 		struct mbuf *n;
1273 
1274 		/* Get the next new mbuf */
1275 		n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1276 			   &nsize);
1277 		if (n == NULL)
1278 			goto nospace;
1279 		if (top == NULL)
1280 			if (!m_dup_pkthdr(n, m, how))
1281 				goto nospace0;
1282 
1283 		/* Link it into the new chain */
1284 		*p = n;
1285 		p = &n->m_next;
1286 
1287 		/* Copy data from original mbuf(s) into new mbuf */
1288 		n->m_len = 0;
1289 		while (n->m_len < nsize && m != NULL) {
1290 			int chunk = min(nsize - n->m_len, m->m_len - moff);
1291 
1292 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1293 			moff += chunk;
1294 			n->m_len += chunk;
1295 			remain -= chunk;
1296 			if (moff == m->m_len) {
1297 				m = m->m_next;
1298 				moff = 0;
1299 			}
1300 		}
1301 
1302 		/* Check correct total mbuf length */
1303 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1304 			("%s: bogus m_pkthdr.len", __func__));
1305 	}
1306 	return (top);
1307 
1308 nospace:
1309 	m_freem(top);
1310 nospace0:
1311 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1312 	return (NULL);
1313 }
1314 
1315 /*
1316  * Concatenate mbuf chain n to m.
1317  * Both chains must be of the same type (e.g. MT_DATA).
1318  * Any m_pkthdr is not updated.
1319  */
1320 void
1321 m_cat(struct mbuf *m, struct mbuf *n)
1322 {
1323 	m = m_last(m);
1324 	while (n) {
1325 		if (m->m_flags & M_EXT ||
1326 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1327 			/* just join the two chains */
1328 			m->m_next = n;
1329 			return;
1330 		}
1331 		/* splat the data from one into the other */
1332 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1333 		    (u_int)n->m_len);
1334 		m->m_len += n->m_len;
1335 		n = m_free(n);
1336 	}
1337 }
1338 
1339 void
1340 m_adj(struct mbuf *mp, int req_len)
1341 {
1342 	int len = req_len;
1343 	struct mbuf *m;
1344 	int count;
1345 
1346 	if ((m = mp) == NULL)
1347 		return;
1348 	if (len >= 0) {
1349 		/*
1350 		 * Trim from head.
1351 		 */
1352 		while (m != NULL && len > 0) {
1353 			if (m->m_len <= len) {
1354 				len -= m->m_len;
1355 				m->m_len = 0;
1356 				m = m->m_next;
1357 			} else {
1358 				m->m_len -= len;
1359 				m->m_data += len;
1360 				len = 0;
1361 			}
1362 		}
1363 		m = mp;
1364 		if (mp->m_flags & M_PKTHDR)
1365 			m->m_pkthdr.len -= (req_len - len);
1366 	} else {
1367 		/*
1368 		 * Trim from tail.  Scan the mbuf chain,
1369 		 * calculating its length and finding the last mbuf.
1370 		 * If the adjustment only affects this mbuf, then just
1371 		 * adjust and return.  Otherwise, rescan and truncate
1372 		 * after the remaining size.
1373 		 */
1374 		len = -len;
1375 		count = 0;
1376 		for (;;) {
1377 			count += m->m_len;
1378 			if (m->m_next == (struct mbuf *)0)
1379 				break;
1380 			m = m->m_next;
1381 		}
1382 		if (m->m_len >= len) {
1383 			m->m_len -= len;
1384 			if (mp->m_flags & M_PKTHDR)
1385 				mp->m_pkthdr.len -= len;
1386 			return;
1387 		}
1388 		count -= len;
1389 		if (count < 0)
1390 			count = 0;
1391 		/*
1392 		 * Correct length for chain is "count".
1393 		 * Find the mbuf with last data, adjust its length,
1394 		 * and toss data from remaining mbufs on chain.
1395 		 */
1396 		m = mp;
1397 		if (m->m_flags & M_PKTHDR)
1398 			m->m_pkthdr.len = count;
1399 		for (; m; m = m->m_next) {
1400 			if (m->m_len >= count) {
1401 				m->m_len = count;
1402 				break;
1403 			}
1404 			count -= m->m_len;
1405 		}
1406 		while (m->m_next)
1407 			(m = m->m_next)->m_len = 0;
1408 	}
1409 }
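/*
 * Illustrative sketch: a positive length trims from the head, a negative
 * length trims from the tail.  For example, stripping an Ethernet header
 * and CRC (constants from <net/ethernet.h>):
 *
 *	m_adj(m, ETHER_HDR_LEN);
 *	m_adj(m, -ETHER_CRC_LEN);
 */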
1410 
1411 /*
1412  * Rearrange an mbuf chain so that len bytes are contiguous
1413  * and in the data area of an mbuf (so that mtod will work for a structure
1414  * of size len).  Returns the resulting mbuf chain on success, frees it and
1415  * returns null on failure.  If there is room, it will add up to
1416  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1417  * avoid being called next time.
1418  */
1419 struct mbuf *
1420 m_pullup(struct mbuf *n, int len)
1421 {
1422 	struct mbuf *m;
1423 	int count;
1424 	int space;
1425 
1426 	/*
1427 	 * If first mbuf has no cluster, and has room for len bytes
1428 	 * without shifting current data, pullup into it,
1429 	 * otherwise allocate a new mbuf to prepend to the chain.
1430 	 */
1431 	if (!(n->m_flags & M_EXT) &&
1432 	    n->m_data + len < &n->m_dat[MLEN] &&
1433 	    n->m_next) {
1434 		if (n->m_len >= len)
1435 			return (n);
1436 		m = n;
1437 		n = n->m_next;
1438 		len -= m->m_len;
1439 	} else {
1440 		if (len > MHLEN)
1441 			goto bad;
1442 		if (n->m_flags & M_PKTHDR)
1443 			m = m_gethdr(MB_DONTWAIT, n->m_type);
1444 		else
1445 			m = m_get(MB_DONTWAIT, n->m_type);
1446 		if (m == NULL)
1447 			goto bad;
1448 		m->m_len = 0;
1449 		if (n->m_flags & M_PKTHDR)
1450 			M_MOVE_PKTHDR(m, n);
1451 	}
1452 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1453 	do {
1454 		count = min(min(max(len, max_protohdr), space), n->m_len);
1455 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1456 		  (unsigned)count);
1457 		len -= count;
1458 		m->m_len += count;
1459 		n->m_len -= count;
1460 		space -= count;
1461 		if (n->m_len)
1462 			n->m_data += count;
1463 		else
1464 			n = m_free(n);
1465 	} while (len > 0 && n);
1466 	if (len > 0) {
1467 		m_free(m);
1468 		goto bad;
1469 	}
1470 	m->m_next = n;
1471 	return (m);
1472 bad:
1473 	m_freem(n);
1474 	atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1);
1475 	return (NULL);
1476 }
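/*
 * Illustrative sketch of the canonical m_pullup() pattern: make sure the
 * first mbuf holds a complete header before casting m_data (struct ip is
 * just an example header type here).
 *
 *	if (m->m_len < sizeof(struct ip)) {
 *		m = m_pullup(m, sizeof(struct ip));
 *		if (m == NULL)
 *			return;
 *	}
 *	ip = mtod(m, struct ip *);
 *
 * Note that on failure the chain has already been freed by m_pullup().
 */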
1477 
1478 /*
1479  * Partition an mbuf chain in two pieces, returning the tail --
1480  * all but the first len0 bytes.  In case of failure, it returns NULL and
1481  * attempts to restore the chain to its original state.
1482  *
1483  * Note that the resulting mbufs might be read-only, because the new
1484  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1485  * the "breaking point" happens to lie within a cluster mbuf. Use the
1486  * M_WRITABLE() macro to check for this case.
1487  */
1488 struct mbuf *
1489 m_split(struct mbuf *m0, int len0, int wait)
1490 {
1491 	struct mbuf *m, *n;
1492 	unsigned len = len0, remain;
1493 
1494 	for (m = m0; m && len > m->m_len; m = m->m_next)
1495 		len -= m->m_len;
1496 	if (m == NULL)
1497 		return (NULL);
1498 	remain = m->m_len - len;
1499 	if (m0->m_flags & M_PKTHDR) {
1500 		n = m_gethdr(wait, m0->m_type);
1501 		if (n == NULL)
1502 			return (NULL);
1503 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1504 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1505 		m0->m_pkthdr.len = len0;
1506 		if (m->m_flags & M_EXT)
1507 			goto extpacket;
1508 		if (remain > MHLEN) {
1509 			/* m can't be the lead packet */
1510 			MH_ALIGN(n, 0);
1511 			n->m_next = m_split(m, len, wait);
1512 			if (n->m_next == NULL) {
1513 				m_free(n);
1514 				return (NULL);
1515 			} else {
1516 				n->m_len = 0;
1517 				return (n);
1518 			}
1519 		} else
1520 			MH_ALIGN(n, remain);
1521 	} else if (remain == 0) {
1522 		n = m->m_next;
1523 		m->m_next = 0;
1524 		return (n);
1525 	} else {
1526 		n = m_get(wait, m->m_type);
1527 		if (n == NULL)
1528 			return (NULL);
1529 		M_ALIGN(n, remain);
1530 	}
1531 extpacket:
1532 	if (m->m_flags & M_EXT) {
1533 		KKASSERT((n->m_flags & M_EXT) == 0);
1534 		n->m_data = m->m_data + len;
1535 		m->m_ext.ext_ref(m->m_ext.ext_arg);
1536 		n->m_ext = m->m_ext;
1537 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1538 	} else {
1539 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1540 	}
1541 	n->m_len = remain;
1542 	m->m_len = len;
1543 	n->m_next = m->m_next;
1544 	m->m_next = 0;
1545 	return (n);
1546 }
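/*
 * Illustrative sketch: split a packet after its header so the payload can
 * be queued separately ("hdrlen" is a placeholder for the caller's header
 * length).  On failure m_split() returns NULL and attempts to leave the
 * original chain intact.
 *
 *	struct mbuf *tail;
 *
 *	tail = m_split(m, hdrlen, MB_DONTWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);
 */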
1547 
1548 /*
1549  * Routine to copy from device local memory into mbufs.
1550  * Note: "offset" is ill-defined and always called as 0, so it is ignored.
1551  */
1552 struct mbuf *
1553 m_devget(char *buf, int len, int offset, struct ifnet *ifp,
1554     void (*copy)(volatile const void *from, volatile void *to, size_t length))
1555 {
1556 	struct mbuf *m, *mfirst = NULL, **mtail;
1557 	int nsize, flags;
1558 
1559 	if (copy == NULL)
1560 		copy = bcopy;
1561 	mtail = &mfirst;
1562 	flags = M_PKTHDR;
1563 
1564 	while (len > 0) {
1565 		m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
1566 		if (m == NULL) {
1567 			m_freem(mfirst);
1568 			return (NULL);
1569 		}
1570 		m->m_len = min(len, nsize);
1571 
1572 		if (flags & M_PKTHDR) {
1573 			if (len + max_linkhdr <= nsize)
1574 				m->m_data += max_linkhdr;
1575 			m->m_pkthdr.rcvif = ifp;
1576 			m->m_pkthdr.len = len;
1577 			flags = 0;
1578 		}
1579 
1580 		copy(buf, m->m_data, (unsigned)m->m_len);
1581 		buf += m->m_len;
1582 		len -= m->m_len;
1583 		*mtail = m;
1584 		mtail = &m->m_next;
1585 	}
1586 
1587 	return (mfirst);
1588 }
1589 
1590 /*
1591  * Routine to pad mbuf to the specified length 'padto'.
1592  */
1593 int
1594 m_devpad(struct mbuf *m, int padto)
1595 {
1596 	struct mbuf *last = NULL;
1597 	int padlen;
1598 
1599 	if (padto <= m->m_pkthdr.len)
1600 		return 0;
1601 
1602 	padlen = padto - m->m_pkthdr.len;
1603 
1604 	/* if there's only the packet-header and we can pad there, use it. */
1605 	if (m->m_pkthdr.len == m->m_len && M_TRAILINGSPACE(m) >= padlen) {
1606 		last = m;
1607 	} else {
1608 		/*
1609 		 * Walk packet chain to find last mbuf. We will either
1610 		 * pad there, or append a new mbuf and pad it
1611 		 */
1612 		for (last = m; last->m_next != NULL; last = last->m_next)
1613 			; /* EMPTY */
1614 
1615 		/* `last' now points to last in chain. */
1616 		if (M_TRAILINGSPACE(last) < padlen) {
1617 			struct mbuf *n;
1618 
1619 			/* Allocate new empty mbuf, pad it.  Compact later. */
1620 			MGET(n, MB_DONTWAIT, MT_DATA);
1621 			if (n == NULL)
1622 				return ENOBUFS;
1623 			n->m_len = 0;
1624 			last->m_next = n;
1625 			last = n;
1626 		}
1627 	}
1628 	KKASSERT(M_TRAILINGSPACE(last) >= padlen);
1629 	KKASSERT(M_WRITABLE(last));
1630 
1631 	/* Now zero the pad area */
1632 	bzero(mtod(last, char *) + last->m_len, padlen);
1633 	last->m_len += padlen;
1634 	m->m_pkthdr.len += padlen;
1635 	return 0;
1636 }
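/*
 * Illustrative sketch: pad a short frame up to the Ethernet minimum before
 * handing it to the hardware (ETHER_MIN_LEN and ETHER_CRC_LEN come from
 * <net/ethernet.h>).
 *
 *	if (m->m_pkthdr.len < ETHER_MIN_LEN - ETHER_CRC_LEN &&
 *	    m_devpad(m, ETHER_MIN_LEN - ETHER_CRC_LEN) != 0) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */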
1637 
1638 /*
1639  * Copy data from a buffer back into the indicated mbuf chain,
1640  * starting "off" bytes from the beginning, extending the mbuf
1641  * chain if necessary.
1642  */
1643 void
1644 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1645 {
1646 	int mlen;
1647 	struct mbuf *m = m0, *n;
1648 	int totlen = 0;
1649 
1650 	if (m0 == NULL)
1651 		return;
1652 	while (off > (mlen = m->m_len)) {
1653 		off -= mlen;
1654 		totlen += mlen;
1655 		if (m->m_next == NULL) {
1656 			n = m_getclr(MB_DONTWAIT, m->m_type);
1657 			if (n == NULL)
1658 				goto out;
1659 			n->m_len = min(MLEN, len + off);
1660 			m->m_next = n;
1661 		}
1662 		m = m->m_next;
1663 	}
1664 	while (len > 0) {
1665 		mlen = min (m->m_len - off, len);
1666 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1667 		cp += mlen;
1668 		len -= mlen;
1669 		mlen += off;
1670 		off = 0;
1671 		totlen += mlen;
1672 		if (len == 0)
1673 			break;
1674 		if (m->m_next == NULL) {
1675 			n = m_get(MB_DONTWAIT, m->m_type);
1676 			if (n == NULL)
1677 				break;
1678 			n->m_len = min(MLEN, len);
1679 			m->m_next = n;
1680 		}
1681 		m = m->m_next;
1682 	}
1683 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1684 		m->m_pkthdr.len = totlen;
1685 }
1686 
1687 void
1688 m_print(const struct mbuf *m)
1689 {
1690 	int len;
1691 	const struct mbuf *m2;
1692 
1693 	len = m->m_pkthdr.len;
1694 	m2 = m;
1695 	while (len) {
1696 		kprintf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1697 		len -= m2->m_len;
1698 		m2 = m2->m_next;
1699 	}
1700 	return;
1701 }
1702 
1703 /*
1704  * "Move" mbuf pkthdr from "from" to "to".
1705  * "from" must have M_PKTHDR set, and "to" must be empty.
1706  */
1707 void
1708 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1709 {
1710 	KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
1711 
1712 	to->m_flags |= from->m_flags & M_COPYFLAGS;
1713 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
1714 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
1715 }
1716 
1717 /*
1718  * Duplicate "from"'s mbuf pkthdr in "to".
1719  * "from" must have M_PKTHDR set, and "to" must be empty.
1720  * In particular, this does a deep copy of the packet tags.
1721  */
1722 int
1723 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1724 {
1725 	KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
1726 
1727 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
1728 		      (to->m_flags & ~M_COPYFLAGS);
1729 	to->m_pkthdr = from->m_pkthdr;
1730 	SLIST_INIT(&to->m_pkthdr.tags);
1731 	return (m_tag_copy_chain(to, from, how));
1732 }
1733 
1734 /*
1735  * Defragment an mbuf chain, returning the shortest possible
1736  * chain of mbufs and clusters.  If allocation fails and
1737  * this cannot be completed, NULL will be returned, but
1738  * the passed in chain will be unchanged.  Upon success,
1739  * the original chain will be freed, and the new chain
1740  * will be returned.
1741  *
1742  * If a non-packet-header mbuf is passed in, the original
1743  * chain will be returned unharmed.
1744  *
1745  * m_defrag_nofree() doesn't free the passed-in mbuf.
1746  */
1747 struct mbuf *
1748 m_defrag(struct mbuf *m0, int how)
1749 {
1750 	struct mbuf *m_new;
1751 
1752 	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
1753 		return (NULL);
1754 	if (m_new != m0)
1755 		m_freem(m0);
1756 	return (m_new);
1757 }
1758 
1759 struct mbuf *
1760 m_defrag_nofree(struct mbuf *m0, int how)
1761 {
1762 	struct mbuf	*m_new = NULL, *m_final = NULL;
1763 	int		progress = 0, length, nsize;
1764 
1765 	if (!(m0->m_flags & M_PKTHDR))
1766 		return (m0);
1767 
1768 #ifdef MBUF_STRESS_TEST
1769 	if (m_defragrandomfailures) {
1770 		int temp = karc4random() & 0xff;
1771 		if (temp == 0xba)
1772 			goto nospace;
1773 	}
1774 #endif
1775 
1776 	m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
1777 	if (m_final == NULL)
1778 		goto nospace;
1779 	m_final->m_len = 0;	/* in case m0->m_pkthdr.len is zero */
1780 
1781 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1782 		goto nospace;
1783 
1784 	m_new = m_final;
1785 
1786 	while (progress < m0->m_pkthdr.len) {
1787 		length = m0->m_pkthdr.len - progress;
1788 		if (length > MCLBYTES)
1789 			length = MCLBYTES;
1790 
1791 		if (m_new == NULL) {
1792 			m_new = m_getl(length, how, MT_DATA, 0, &nsize);
1793 			if (m_new == NULL)
1794 				goto nospace;
1795 		}
1796 
1797 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1798 		progress += length;
1799 		m_new->m_len = length;
1800 		if (m_new != m_final)
1801 			m_cat(m_final, m_new);
1802 		m_new = NULL;
1803 	}
1804 	if (m0->m_next == NULL)
1805 		m_defraguseless++;
1806 	m_defragpackets++;
1807 	m_defragbytes += m_final->m_pkthdr.len;
1808 	return (m_final);
1809 nospace:
1810 	m_defragfailure++;
1811 	if (m_new)
1812 		m_free(m_new);
1813 	m_freem(m_final);
1814 	return (NULL);
1815 }
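/*
 * Illustrative sketch of the usual transmit-path pattern: if a chain has
 * too many fragments for the hardware's DMA engine, collapse it first.
 *
 *	struct mbuf *n;
 *
 *	n = m_defrag(m, MB_DONTWAIT);
 *	if (n == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = n;
 *
 * On success m_defrag() has already freed the original chain when a new
 * one was built, so only the returned pointer remains valid.
 */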
1816 
1817 /*
1818  * Move data from uio into mbufs.
1819  */
1820 struct mbuf *
1821 m_uiomove(struct uio *uio)
1822 {
1823 	struct mbuf *m;			/* current working mbuf */
1824 	struct mbuf *head = NULL;	/* result mbuf chain */
1825 	struct mbuf **mp = &head;
1826 	int resid = uio->uio_resid, nsize, flags = M_PKTHDR, error;
1827 
1828 	do {
1829 		m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
1830 		if (flags) {
1831 			m->m_pkthdr.len = 0;
1832 			/* Leave room for protocol headers. */
1833 			if (resid < MHLEN)
1834 				MH_ALIGN(m, resid);
1835 			flags = 0;
1836 		}
1837 		m->m_len = min(nsize, resid);
1838 		error = uiomove(mtod(m, caddr_t), m->m_len, uio);
1839 		if (error) {
1840 			m_free(m);
1841 			goto failed;
1842 		}
1843 		*mp = m;
1844 		mp = &m->m_next;
1845 		head->m_pkthdr.len += m->m_len;
1846 		resid -= m->m_len;
1847 	} while (resid > 0);
1848 
1849 	return (head);
1850 
1851 failed:
1852 	m_freem(head);
1853 	return (NULL);
1854 }
1855 
1856 struct mbuf *
1857 m_last(struct mbuf *m)
1858 {
1859 	while (m->m_next)
1860 		m = m->m_next;
1861 	return (m);
1862 }
1863 
1864 /*
1865  * Return the number of bytes in an mbuf chain.
1866  * If lastm is not NULL, also return the last mbuf.
1867  */
1868 u_int
1869 m_lengthm(struct mbuf *m, struct mbuf **lastm)
1870 {
1871 	u_int len = 0;
1872 	struct mbuf *prev = m;
1873 
1874 	while (m) {
1875 		len += m->m_len;
1876 		prev = m;
1877 		m = m->m_next;
1878 	}
1879 	if (lastm != NULL)
1880 		*lastm = prev;
1881 	return (len);
1882 }
1883 
1884 /*
1885  * Like m_lengthm(), except also keep track of mbuf usage.
1886  */
1887 u_int
1888 m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
1889 {
1890 	u_int len = 0, mbcnt = 0;
1891 	struct mbuf *prev = m;
1892 
1893 	while (m) {
1894 		len += m->m_len;
1895 		mbcnt += MSIZE;
1896 		if (m->m_flags & M_EXT)
1897 			mbcnt += m->m_ext.ext_size;
1898 		prev = m;
1899 		m = m->m_next;
1900 	}
1901 	if (lastm != NULL)
1902 		*lastm = prev;
1903 	*pmbcnt = mbcnt;
1904 	return (len);
1905 }
1906