xref: /dragonfly/sys/kern/uipc_mbuf.c (revision b2776052)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
5  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
6  *
7  * This code is derived from software contributed to The DragonFly Project
8  * by Jeffrey M. Hsu.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 /*
37  * Copyright (c) 1982, 1986, 1988, 1991, 1993
38  *	The Regents of the University of California.  All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  *
64  * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
65  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
66  */
67 
68 #include "opt_param.h"
69 #include "opt_mbuf_stress_test.h"
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/file.h>
73 #include <sys/malloc.h>
74 #include <sys/mbuf.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/domain.h>
78 #include <sys/objcache.h>
79 #include <sys/tree.h>
80 #include <sys/protosw.h>
81 #include <sys/uio.h>
82 #include <sys/thread.h>
83 #include <sys/globaldata.h>
84 
85 #include <sys/thread2.h>
86 #include <sys/spinlock2.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/limits.h>
90 
91 #include <vm/vm.h>
92 #include <vm/vm_kern.h>
93 #include <vm/vm_extern.h>
94 
95 #ifdef INVARIANTS
96 #include <machine/cpu.h>
97 #endif
98 
99 /*
100  * mbuf cluster meta-data
101  */
102 struct mbcluster {
103 	int32_t	mcl_refs;
104 	void	*mcl_data;
105 };
106 
107 /*
108  * mbuf tracking for debugging purposes
109  */
110 #ifdef MBUF_DEBUG
111 
112 static MALLOC_DEFINE(M_MTRACK, "mtrack", "mtrack");
113 
114 struct mbtrack;
115 RB_HEAD(mbuf_rb_tree, mbtrack);
116 RB_PROTOTYPE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *);
117 
118 struct mbtrack {
119 	RB_ENTRY(mbtrack) rb_node;
120 	int trackid;
121 	struct mbuf *m;
122 };
123 
124 static int
125 mbtrack_cmp(struct mbtrack *mb1, struct mbtrack *mb2)
126 {
127 	if (mb1->m < mb2->m)
128 		return(-1);
129 	if (mb1->m > mb2->m)
130 		return(1);
131 	return(0);
132 }
133 
134 RB_GENERATE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *, m);
135 
136 struct mbuf_rb_tree	mbuf_track_root;
137 static struct spinlock	mbuf_track_spin = SPINLOCK_INITIALIZER(mbuf_track_spin, "mbuf_track_spin");
138 
139 static void
140 mbuftrack(struct mbuf *m)
141 {
142 	struct mbtrack *mbt;
143 
144 	mbt = kmalloc(sizeof(*mbt), M_MTRACK, M_INTWAIT|M_ZERO);
145 	spin_lock(&mbuf_track_spin);
146 	mbt->m = m;
147 	if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt)) {
148 		spin_unlock(&mbuf_track_spin);
149 		panic("mbuftrack: mbuf %p already being tracked", m);
150 	}
151 	spin_unlock(&mbuf_track_spin);
152 }
153 
154 static void
155 mbufuntrack(struct mbuf *m)
156 {
157 	struct mbtrack *mbt;
158 
159 	spin_lock(&mbuf_track_spin);
160 	mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
161 	if (mbt == NULL) {
162 		spin_unlock(&mbuf_track_spin);
163 		panic("mbufuntrack: mbuf %p was not tracked", m);
164 	} else {
165 		mbuf_rb_tree_RB_REMOVE(&mbuf_track_root, mbt);
166 		spin_unlock(&mbuf_track_spin);
167 		kfree(mbt, M_MTRACK);
168 	}
169 }
170 
171 void
172 mbuftrackid(struct mbuf *m, int trackid)
173 {
174 	struct mbtrack *mbt;
175 	struct mbuf *n;
176 
177 	spin_lock(&mbuf_track_spin);
178 	while (m) {
179 		n = m->m_nextpkt;
180 		while (m) {
181 			mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
182 			if (mbt == NULL) {
183 				spin_unlock(&mbuf_track_spin);
184 				panic("mbuftrackid: mbuf %p not tracked", m);
185 			}
186 			mbt->trackid = trackid;
187 			m = m->m_next;
188 		}
189 		m = n;
190 	}
191 	spin_unlock(&mbuf_track_spin);
192 }
193 
194 static int
195 mbuftrack_callback(struct mbtrack *mbt, void *arg)
196 {
197 	struct sysctl_req *req = arg;
198 	char buf[64];
199 	int error;
200 
201 	ksnprintf(buf, sizeof(buf), "mbuf %p track %d\n", mbt->m, mbt->trackid);
202 
203 	spin_unlock(&mbuf_track_spin);
204 	error = SYSCTL_OUT(req, buf, strlen(buf));
205 	spin_lock(&mbuf_track_spin);
206 	if (error)
207 		return(-error);
208 	return(0);
209 }
210 
211 static int
212 mbuftrack_show(SYSCTL_HANDLER_ARGS)
213 {
214 	int error;
215 
216 	spin_lock(&mbuf_track_spin);
217 	error = mbuf_rb_tree_RB_SCAN(&mbuf_track_root, NULL,
218 				     mbuftrack_callback, req);
219 	spin_unlock(&mbuf_track_spin);
220 	return (-error);
221 }
222 SYSCTL_PROC(_kern_ipc, OID_AUTO, showmbufs, CTLFLAG_RD|CTLTYPE_STRING,
223 	    0, 0, mbuftrack_show, "A", "Show all in-use mbufs");
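
/*
 * Illustrative use: with MBUF_DEBUG compiled in, the tracked mbufs can be
 * dumped from userland via the sysctl defined above, e.g.
 *
 *	sysctl kern.ipc.showmbufs
 *
 * which emits one "mbuf %p track %d" line per in-use mbuf.
 */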
224 
225 #else
226 
227 #define mbuftrack(m)
228 #define mbufuntrack(m)
229 
230 #endif
231 
232 static void mbinit(void *);
233 SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL);
234 
235 struct mbtypes_stat {
236 	u_long	stats[MT_NTYPES];
237 } __cachealign;
238 
239 static struct mbtypes_stat	mbtypes[SMP_MAXCPU];
240 
241 static struct mbstat mbstat[SMP_MAXCPU] __cachealign;
242 int	max_linkhdr;
243 int	max_protohdr;
244 int	max_hdr;
245 int	max_datalen;
246 int	m_defragpackets;
247 int	m_defragbytes;
248 int	m_defraguseless;
249 int	m_defragfailure;
250 #ifdef MBUF_STRESS_TEST
251 int	m_defragrandomfailures;
252 #endif
253 
254 struct objcache *mbuf_cache, *mbufphdr_cache;
255 struct objcache *mclmeta_cache, *mjclmeta_cache;
256 struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;
257 struct objcache *mbufjcluster_cache, *mbufphdrjcluster_cache;
258 
259 int		nmbclusters;
260 static int	nmbjclusters;
261 int		nmbufs;
262 
263 static int	mjclph_cachefrac;
264 static int	mjcl_cachefrac;
265 static int	mclph_cachefrac;
266 static int	mcl_cachefrac;
267 
268 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
269 	&max_linkhdr, 0, "Max size of a link-level header");
270 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
271 	&max_protohdr, 0, "Max size of a protocol header");
272 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0,
273 	"Max size of link+protocol headers");
274 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
275 	&max_datalen, 0, "Max data payload size without headers");
276 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
277 	&mbuf_wait, 0, "Time in ticks to sleep after failed mbuf allocations");
278 static int do_mbstat(SYSCTL_HANDLER_ARGS);
279 
280 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT|CTLFLAG_RD,
281 	0, 0, do_mbstat, "S,mbstat", "mbuf usage statistics");
282 
283 static int do_mbtypes(SYSCTL_HANDLER_ARGS);
284 
285 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtypes, CTLTYPE_ULONG|CTLFLAG_RD,
286 	0, 0, do_mbtypes, "LU", "");
287 
288 static int
289 do_mbstat(SYSCTL_HANDLER_ARGS)
290 {
291 	struct mbstat mbstat_total;
292 	struct mbstat *mbstat_totalp;
293 	int i;
294 
295 	bzero(&mbstat_total, sizeof(mbstat_total));
296 	mbstat_totalp = &mbstat_total;
297 
298 	for (i = 0; i < ncpus; i++)
299 	{
300 		mbstat_total.m_mbufs += mbstat[i].m_mbufs;
301 		mbstat_total.m_clusters += mbstat[i].m_clusters;
302 		mbstat_total.m_jclusters += mbstat[i].m_jclusters;
303 		mbstat_total.m_clfree += mbstat[i].m_clfree;
304 		mbstat_total.m_drops += mbstat[i].m_drops;
305 		mbstat_total.m_wait += mbstat[i].m_wait;
306 		mbstat_total.m_drain += mbstat[i].m_drain;
307 		mbstat_total.m_mcfail += mbstat[i].m_mcfail;
308 		mbstat_total.m_mpfail += mbstat[i].m_mpfail;
309 
310 	}
311 	/*
312 	 * The following fields are not cumulative fields so just
313 	 * get their values once.
314 	 */
315 	mbstat_total.m_msize = mbstat[0].m_msize;
316 	mbstat_total.m_mclbytes = mbstat[0].m_mclbytes;
317 	mbstat_total.m_minclsize = mbstat[0].m_minclsize;
318 	mbstat_total.m_mlen = mbstat[0].m_mlen;
319 	mbstat_total.m_mhlen = mbstat[0].m_mhlen;
320 
321 	return(sysctl_handle_opaque(oidp, mbstat_totalp, sizeof(mbstat_total), req));
322 }
323 
324 static int
325 do_mbtypes(SYSCTL_HANDLER_ARGS)
326 {
327 	u_long totals[MT_NTYPES];
328 	int i, j;
329 
330 	for (i = 0; i < MT_NTYPES; i++)
331 		totals[i] = 0;
332 
333 	for (i = 0; i < ncpus; i++)
334 	{
335 		for (j = 0; j < MT_NTYPES; j++)
336 			totals[j] += mbtypes[i].stats[j];
337 	}
338 
339 	return(sysctl_handle_opaque(oidp, totals, sizeof(totals), req));
340 }
341 
342 /*
343  * These are read-only because we do not currently have any code
344  * to adjust the objcache limits after the fact.  The variables
345  * may only be set as boot-time tunables.
346  */
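
/*
 * For example (illustrative values only), the corresponding tunables can
 * be set from the boot loader configuration:
 *
 *	kern.ipc.nmbclusters="65536"
 *	kern.ipc.nmbufs="131072"
 */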
347 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
348 	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
349 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
350 	   "Maximum number of mbufs available");
351 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjclusters, CTLFLAG_RD, &nmbjclusters, 0,
352 	   "Maximum number of mbuf jclusters available");
353 SYSCTL_INT(_kern_ipc, OID_AUTO, mjclph_cachefrac, CTLFLAG_RD,
354 	   &mjclph_cachefrac, 0,
355 	   "Fraction of cacheable mbuf jclusters w/ pkthdr");
356 SYSCTL_INT(_kern_ipc, OID_AUTO, mjcl_cachefrac, CTLFLAG_RD,
357 	   &mjcl_cachefrac, 0,
358 	   "Fraction of cacheable mbuf jclusters");
359 SYSCTL_INT(_kern_ipc, OID_AUTO, mclph_cachefrac, CTLFLAG_RD,
360     	   &mclph_cachefrac, 0,
361 	   "Fraction of cacheable mbuf clusters w/ pkthdr");
362 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_cachefrac, CTLFLAG_RD,
363     	   &mcl_cachefrac, 0, "Fraction of cacheable mbuf clusters");
364 
365 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
366 	   &m_defragpackets, 0, "Number of defragment packets");
367 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
368 	   &m_defragbytes, 0, "Number of defragment bytes");
369 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
370 	   &m_defraguseless, 0, "Number of useless defragment mbuf chain operations");
371 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
372 	   &m_defragfailure, 0, "Number of failed defragment mbuf chain operations");
373 #ifdef MBUF_STRESS_TEST
374 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
375 	   &m_defragrandomfailures, 0, "");
376 #endif
377 
378 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
379 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
380 static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");
381 
382 static void m_reclaim (void);
383 static void m_mclref(void *arg);
384 static void m_mclfree(void *arg);
385 static void m_mjclfree(void *arg);
386 
387 /*
388  * NOTE: Default NMBUFS must take into account a possible DOS attack
389  *	 using fd passing on unix domain sockets.
390  */
391 #ifndef NMBCLUSTERS
392 #define NMBCLUSTERS	(512 + maxusers * 16)
393 #endif
394 #ifndef MJCLPH_CACHEFRAC
395 #define MJCLPH_CACHEFRAC 16
396 #endif
397 #ifndef MJCL_CACHEFRAC
398 #define MJCL_CACHEFRAC	4
399 #endif
400 #ifndef MCLPH_CACHEFRAC
401 #define MCLPH_CACHEFRAC	16
402 #endif
403 #ifndef MCL_CACHEFRAC
404 #define MCL_CACHEFRAC	4
405 #endif
406 #ifndef NMBJCLUSTERS
407 #define NMBJCLUSTERS	(NMBCLUSTERS / 2)
408 #endif
409 #ifndef NMBUFS
410 #define NMBUFS		(nmbclusters * 2 + maxfiles)
411 #endif
412 
413 /*
414  * Fetch and sanity-check the boot-time tunables declared above.
415  */
416 static void
417 tunable_mbinit(void *dummy)
418 {
419 	/*
420 	 * This has to be done before VM init.
421 	 */
422 	nmbclusters = NMBCLUSTERS;
423 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
424 	mjclph_cachefrac = MJCLPH_CACHEFRAC;
425 	TUNABLE_INT_FETCH("kern.ipc.mjclph_cachefrac", &mjclph_cachefrac);
426 	mjcl_cachefrac = MJCL_CACHEFRAC;
427 	TUNABLE_INT_FETCH("kern.ipc.mjcl_cachefrac", &mjcl_cachefrac);
428 	mclph_cachefrac = MCLPH_CACHEFRAC;
429 	TUNABLE_INT_FETCH("kern.ipc.mclph_cachefrac", &mclph_cachefrac);
430 	mcl_cachefrac = MCL_CACHEFRAC;
431 	TUNABLE_INT_FETCH("kern.ipc.mcl_cachefrac", &mcl_cachefrac);
432 
433 	/*
434 	 * WARNING!  Each mcl cache feeds two mbuf caches, so the minimum
435 	 *	    cachefrac is 2.  For safety, use 3.
436 	 */
437 	if (mjclph_cachefrac < 3)
438 		mjclph_cachefrac = 3;
439 	if (mjcl_cachefrac < 3)
440 		mjcl_cachefrac = 3;
441 	if (mclph_cachefrac < 3)
442 		mclph_cachefrac = 3;
443 	if (mcl_cachefrac < 3)
444 		mcl_cachefrac = 3;
445 
446 	nmbjclusters = NMBJCLUSTERS;
447 	TUNABLE_INT_FETCH("kern.ipc.nmbjclusters", &nmbjclusters);
448 
449 	nmbufs = NMBUFS;
450 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
451 
452 	/* Sanity checks */
453 	if (nmbufs < nmbclusters * 2)
454 		nmbufs = nmbclusters * 2;
455 }
456 SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
457 	tunable_mbinit, NULL);
458 
459 /* "number of clusters of pages" */
460 #define NCL_INIT	1
461 
462 #define NMB_INIT	16
463 
464 /*
465  * The mbuf object cache only guarantees that m_next and m_nextpkt are
466  * NULL and that m_data points to the beginning of the data area.  In
467  * particular, m_len and m_pkthdr.len are uninitialized.  It is the
468  * responsibility of the caller to initialize those fields before use.
469  */
470 
471 static __inline boolean_t
472 mbuf_ctor(void *obj, void *private, int ocflags)
473 {
474 	struct mbuf *m = obj;
475 
476 	m->m_next = NULL;
477 	m->m_nextpkt = NULL;
478 	m->m_data = m->m_dat;
479 	m->m_flags = 0;
480 
481 	return (TRUE);
482 }
483 
484 /*
485  * Initialize the mbuf and the packet header fields.
486  */
487 static boolean_t
488 mbufphdr_ctor(void *obj, void *private, int ocflags)
489 {
490 	struct mbuf *m = obj;
491 
492 	m->m_next = NULL;
493 	m->m_nextpkt = NULL;
494 	m->m_data = m->m_pktdat;
495 	m->m_flags = M_PKTHDR | M_PHCACHE;
496 
497 	m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
498 	SLIST_INIT(&m->m_pkthdr.tags);
499 	m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
500 	m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
501 
502 	return (TRUE);
503 }
504 
505 /*
506  * A mbcluster object consists of 2K (MCLBYTES) cluster and a refcount.
507  */
508 static boolean_t
509 mclmeta_ctor(void *obj, void *private, int ocflags)
510 {
511 	struct mbcluster *cl = obj;
512 	void *buf;
513 
514 	if (ocflags & M_NOWAIT)
515 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
516 	else
517 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
518 	if (buf == NULL)
519 		return (FALSE);
520 	cl->mcl_refs = 0;
521 	cl->mcl_data = buf;
522 	return (TRUE);
523 }
524 
525 static boolean_t
526 mjclmeta_ctor(void *obj, void *private, int ocflags)
527 {
528 	struct mbcluster *cl = obj;
529 	void *buf;
530 
531 	if (ocflags & M_NOWAIT)
532 		buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_NOWAIT | M_ZERO);
533 	else
534 		buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_INTWAIT | M_ZERO);
535 	if (buf == NULL)
536 		return (FALSE);
537 	cl->mcl_refs = 0;
538 	cl->mcl_data = buf;
539 	return (TRUE);
540 }
541 
542 static void
543 mclmeta_dtor(void *obj, void *private)
544 {
545 	struct mbcluster *mcl = obj;
546 
547 	KKASSERT(mcl->mcl_refs == 0);
548 	kfree(mcl->mcl_data, M_MBUFCL);
549 }
550 
551 static void
552 linkjcluster(struct mbuf *m, struct mbcluster *cl, uint size)
553 {
554 	/*
555 	 * Add the cluster to the mbuf.  The caller will detect that the
556 	 * mbuf now has an attached cluster.
557 	 */
558 	m->m_ext.ext_arg = cl;
559 	m->m_ext.ext_buf = cl->mcl_data;
560 	m->m_ext.ext_ref = m_mclref;
561 	if (size != MCLBYTES)
562 		m->m_ext.ext_free = m_mjclfree;
563 	else
564 		m->m_ext.ext_free = m_mclfree;
565 	m->m_ext.ext_size = size;
566 	atomic_add_int(&cl->mcl_refs, 1);
567 
568 	m->m_data = m->m_ext.ext_buf;
569 	m->m_flags |= M_EXT | M_EXT_CLUSTER;
570 }
571 
572 static void
573 linkcluster(struct mbuf *m, struct mbcluster *cl)
574 {
575 	linkjcluster(m, cl, MCLBYTES);
576 }
577 
578 static boolean_t
579 mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
580 {
581 	struct mbuf *m = obj;
582 	struct mbcluster *cl;
583 
584 	mbufphdr_ctor(obj, private, ocflags);
585 	cl = objcache_get(mclmeta_cache, ocflags);
586 	if (cl == NULL) {
587 		++mbstat[mycpu->gd_cpuid].m_drops;
588 		return (FALSE);
589 	}
590 	m->m_flags |= M_CLCACHE;
591 	linkcluster(m, cl);
592 	return (TRUE);
593 }
594 
595 static boolean_t
596 mbufphdrjcluster_ctor(void *obj, void *private, int ocflags)
597 {
598 	struct mbuf *m = obj;
599 	struct mbcluster *cl;
600 
601 	mbufphdr_ctor(obj, private, ocflags);
602 	cl = objcache_get(mjclmeta_cache, ocflags);
603 	if (cl == NULL) {
604 		++mbstat[mycpu->gd_cpuid].m_drops;
605 		return (FALSE);
606 	}
607 	m->m_flags |= M_CLCACHE;
608 	linkjcluster(m, cl, MJUMPAGESIZE);
609 	return (TRUE);
610 }
611 
612 static boolean_t
613 mbufcluster_ctor(void *obj, void *private, int ocflags)
614 {
615 	struct mbuf *m = obj;
616 	struct mbcluster *cl;
617 
618 	mbuf_ctor(obj, private, ocflags);
619 	cl = objcache_get(mclmeta_cache, ocflags);
620 	if (cl == NULL) {
621 		++mbstat[mycpu->gd_cpuid].m_drops;
622 		return (FALSE);
623 	}
624 	m->m_flags |= M_CLCACHE;
625 	linkcluster(m, cl);
626 	return (TRUE);
627 }
628 
629 static boolean_t
630 mbufjcluster_ctor(void *obj, void *private, int ocflags)
631 {
632 	struct mbuf *m = obj;
633 	struct mbcluster *cl;
634 
635 	mbuf_ctor(obj, private, ocflags);
636 	cl = objcache_get(mjclmeta_cache, ocflags);
637 	if (cl == NULL) {
638 		++mbstat[mycpu->gd_cpuid].m_drops;
639 		return (FALSE);
640 	}
641 	m->m_flags |= M_CLCACHE;
642 	linkjcluster(m, cl, MJUMPAGESIZE);
643 	return (TRUE);
644 }
645 
646 /*
647  * Used for both the cluster and cluster PHDR caches.
648  *
649  * The mbuf may have lost its cluster due to sharing; deal
650  * with that situation by checking M_EXT.
651  */
652 static void
653 mbufcluster_dtor(void *obj, void *private)
654 {
655 	struct mbuf *m = obj;
656 	struct mbcluster *mcl;
657 
658 	if (m->m_flags & M_EXT) {
659 		KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
660 		mcl = m->m_ext.ext_arg;
661 		KKASSERT(mcl->mcl_refs == 1);
662 		mcl->mcl_refs = 0;
663 		if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES)
664 			objcache_put(mjclmeta_cache, mcl);
665 		else
666 			objcache_put(mclmeta_cache, mcl);
667 	}
668 }
669 
670 struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
671 struct objcache_malloc_args mclmeta_malloc_args =
672 	{ sizeof(struct mbcluster), M_MCLMETA };
673 
674 /* ARGSUSED*/
675 static void
676 mbinit(void *dummy)
677 {
678 	int mb_limit, cl_limit, ncl_limit, jcl_limit;
679 	int limit;
680 	int i;
681 
682 	/*
683 	 * Initialize statistics
684 	 */
685 	for (i = 0; i < ncpus; i++) {
686 		mbstat[i].m_msize = MSIZE;
687 		mbstat[i].m_mclbytes = MCLBYTES;
688 		mbstat[i].m_mjumpagesize = MJUMPAGESIZE;
689 		mbstat[i].m_minclsize = MINCLSIZE;
690 		mbstat[i].m_mlen = MLEN;
691 		mbstat[i].m_mhlen = MHLEN;
692 	}
693 
694 	/*
695 	 * Create object caches and save cluster limits, which will
696 	 * be used to adjust backing kmalloc pools' limit later.
697 	 */
698 
699 	mb_limit = cl_limit = 0;
700 
701 	limit = nmbufs;
702 	mbuf_cache = objcache_create("mbuf",
703 	    limit, nmbufs / 4,
704 	    mbuf_ctor, NULL, NULL,
705 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
706 	mb_limit += limit;
707 
708 	limit = nmbufs;
709 	mbufphdr_cache = objcache_create("mbuf pkt hdr",
710 	    limit, nmbufs / 4,
711 	    mbufphdr_ctor, NULL, NULL,
712 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
713 	mb_limit += limit;
714 
715 	ncl_limit = nmbclusters;
716 	mclmeta_cache = objcache_create("cluster mbuf",
717 	    ncl_limit, nmbclusters / 4,
718 	    mclmeta_ctor, mclmeta_dtor, NULL,
719 	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
720 	cl_limit += ncl_limit;
721 
722 	jcl_limit = nmbjclusters;
723 	mjclmeta_cache = objcache_create("jcluster mbuf",
724 	    jcl_limit, nmbjclusters / 4,
725 	    mjclmeta_ctor, mclmeta_dtor, NULL,
726 	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
727 	cl_limit += jcl_limit;
728 
729 	limit = nmbclusters;
730 	mbufcluster_cache = objcache_create("mbuf + cluster",
731 	    limit, nmbclusters / mcl_cachefrac,
732 	    mbufcluster_ctor, mbufcluster_dtor, NULL,
733 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
734 	mb_limit += limit;
735 
736 	limit = nmbclusters;
737 	mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
738 	    limit, nmbclusters / mclph_cachefrac,
739 	    mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
740 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
741 	mb_limit += limit;
742 
743 	limit = nmbjclusters;
744 	mbufjcluster_cache = objcache_create("mbuf + jcluster",
745 	    limit, nmbjclusters / mjcl_cachefrac,
746 	    mbufjcluster_ctor, mbufcluster_dtor, NULL,
747 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
748 	mb_limit += limit;
749 
750 	limit = nmbjclusters;
751 	mbufphdrjcluster_cache = objcache_create("mbuf pkt hdr + jcluster",
752 	    limit, nmbjclusters / mjclph_cachefrac,
753 	    mbufphdrjcluster_ctor, mbufcluster_dtor, NULL,
754 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
755 	mb_limit += limit;
756 
757 	/*
758 	 * Adjust backing kmalloc pools' limit
759 	 *
760 	 * NOTE: We raise the limit by another 1/8 to take the effect
761 	 * of loosememuse into account.
762 	 */
763 	cl_limit += cl_limit / 8;
764 	kmalloc_raise_limit(mclmeta_malloc_args.mtype,
765 			    mclmeta_malloc_args.objsize * (size_t)cl_limit);
766 	kmalloc_raise_limit(M_MBUFCL,
767 			    (MCLBYTES * (size_t)ncl_limit) +
768 			    (MJUMPAGESIZE * (size_t)jcl_limit));
769 
770 	mb_limit += mb_limit / 8;
771 	kmalloc_raise_limit(mbuf_malloc_args.mtype,
772 			    mbuf_malloc_args.objsize * (size_t)mb_limit);
773 }
774 
775 /*
776  * Return the number of references to this mbuf's data.  0 is returned
777  * if the mbuf is not M_EXT, a reference count is returned if it is
778  * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
779  */
780 int
781 m_sharecount(struct mbuf *m)
782 {
783 	switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
784 	case 0:
785 		return (0);
786 	case M_EXT:
787 		return (99);
788 	case M_EXT | M_EXT_CLUSTER:
789 		return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
790 	}
791 	/* NOTREACHED */
792 	return (0);		/* to shut up compiler */
793 }
794 
795 /*
796  * change mbuf to new type
797  */
798 void
799 m_chtype(struct mbuf *m, int type)
800 {
801 	struct globaldata *gd = mycpu;
802 
803 	++mbtypes[gd->gd_cpuid].stats[type];
804 	--mbtypes[gd->gd_cpuid].stats[m->m_type];
805 	m->m_type = type;
806 }
807 
808 static void
809 m_reclaim(void)
810 {
811 	struct domain *dp;
812 	struct protosw *pr;
813 
814 	kprintf("Debug: m_reclaim() called\n");
815 
816 	SLIST_FOREACH(dp, &domains, dom_next) {
817 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
818 			if (pr->pr_drain)
819 				(*pr->pr_drain)();
820 		}
821 	}
822 	++mbstat[mycpu->gd_cpuid].m_drain;
823 }
824 
825 static __inline void
826 updatestats(struct mbuf *m, int type)
827 {
828 	struct globaldata *gd = mycpu;
829 
830 	m->m_type = type;
831 	mbuftrack(m);
832 #ifdef MBUF_DEBUG
833 	KASSERT(m->m_next == NULL, ("mbuf %p: bad m_next in get", m));
834 	KASSERT(m->m_nextpkt == NULL, ("mbuf %p: bad m_nextpkt in get", m));
835 #endif
836 
837 	++mbtypes[gd->gd_cpuid].stats[type];
838 	++mbstat[gd->gd_cpuid].m_mbufs;
839 
840 }
841 
842 /*
843  * Allocate an mbuf.
844  */
845 struct mbuf *
846 m_get(int how, int type)
847 {
848 	struct mbuf *m;
849 	int ntries = 0;
850 	int ocf = MBTOM(how);
851 
852 retryonce:
853 
854 	m = objcache_get(mbuf_cache, ocf);
855 
856 	if (m == NULL) {
857 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
858 			struct objcache *reclaimlist[] = {
859 				mbufphdr_cache,
860 				mbufcluster_cache,
861 				mbufphdrcluster_cache,
862 				mbufjcluster_cache,
863 				mbufphdrjcluster_cache
864 			};
865 			const int nreclaims = NELEM(reclaimlist);
866 
867 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
868 				m_reclaim();
869 			goto retryonce;
870 		}
871 		++mbstat[mycpu->gd_cpuid].m_drops;
872 		return (NULL);
873 	}
874 #ifdef MBUF_DEBUG
875 	KASSERT(m->m_data == m->m_dat, ("mbuf %p: bad m_data in get", m));
876 #endif
877 	m->m_len = 0;
878 
879 	updatestats(m, type);
880 	return (m);
881 }
882 
883 struct mbuf *
884 m_gethdr(int how, int type)
885 {
886 	struct mbuf *m;
887 	int ocf = MBTOM(how);
888 	int ntries = 0;
889 
890 retryonce:
891 
892 	m = objcache_get(mbufphdr_cache, ocf);
893 
894 	if (m == NULL) {
895 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
896 			struct objcache *reclaimlist[] = {
897 				mbuf_cache,
898 				mbufcluster_cache, mbufphdrcluster_cache,
899 				mbufjcluster_cache, mbufphdrjcluster_cache
900 			};
901 			const int nreclaims = NELEM(reclaimlist);
902 
903 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
904 				m_reclaim();
905 			goto retryonce;
906 		}
907 		++mbstat[mycpu->gd_cpuid].m_drops;
908 		return (NULL);
909 	}
910 #ifdef MBUF_DEBUG
911 	KASSERT(m->m_data == m->m_pktdat, ("mbuf %p: bad m_data in get", m));
912 #endif
913 	m->m_len = 0;
914 	m->m_pkthdr.len = 0;
915 
916 	updatestats(m, type);
917 	return (m);
918 }
919 
920 /*
921  * Get an mbuf (not an mbuf cluster!) and zero its data area.
922  * Deprecated.
923  */
924 struct mbuf *
925 m_getclr(int how, int type)
926 {
927 	struct mbuf *m;
928 
929 	m = m_get(how, type);
930 	if (m != NULL)
931 		bzero(m->m_data, MLEN);
932 	return (m);
933 }
934 
935 static struct mbuf *
936 m_getcl_cache(int how, short type, int flags, struct objcache *mbclc,
937     struct objcache *mbphclc, u_long *cl_stats)
938 {
939 	struct mbuf *m = NULL;
940 	int ocflags = MBTOM(how);
941 	int ntries = 0;
942 
943 retryonce:
944 
945 	if (flags & M_PKTHDR)
946 		m = objcache_get(mbphclc, ocflags);
947 	else
948 		m = objcache_get(mbclc, ocflags);
949 
950 	if (m == NULL) {
951 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
952 			struct objcache *reclaimlist[1];
953 
954 			if (flags & M_PKTHDR)
955 				reclaimlist[0] = mbclc;
956 			else
957 				reclaimlist[0] = mbphclc;
958 			if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
959 				m_reclaim();
960 			goto retryonce;
961 		}
962 		++mbstat[mycpu->gd_cpuid].m_drops;
963 		return (NULL);
964 	}
965 
966 #ifdef MBUF_DEBUG
967 	KASSERT(m->m_data == m->m_ext.ext_buf,
968 		("mbuf %p: bad m_data in get", m));
969 #endif
970 	m->m_type = type;
971 	m->m_len = 0;
972 	m->m_pkthdr.len = 0;	/* just do it unconditionally */
973 
974 	mbuftrack(m);
975 
976 	++mbtypes[mycpu->gd_cpuid].stats[type];
977 	++(*cl_stats);
978 	return (m);
979 }
980 
981 struct mbuf *
982 m_getjcl(int how, short type, int flags, size_t size)
983 {
984 	struct objcache *mbclc, *mbphclc;
985 	u_long *cl_stats;
986 
987 	switch (size) {
988 	case MCLBYTES:
989 		mbclc = mbufcluster_cache;
990 		mbphclc = mbufphdrcluster_cache;
991 		cl_stats = &mbstat[mycpu->gd_cpuid].m_clusters;
992 		break;
993 
994 	default:
995 		mbclc = mbufjcluster_cache;
996 		mbphclc = mbufphdrjcluster_cache;
997 		cl_stats = &mbstat[mycpu->gd_cpuid].m_jclusters;
998 		break;
999 	}
1000 	return m_getcl_cache(how, type, flags, mbclc, mbphclc, cl_stats);
1001 }
1002 
1003 /*
1004  * Returns an mbuf with an attached cluster.
1005  * Because many network drivers use this kind of buffer a lot, it is
1006  * convenient to keep a small pool of free buffers of this kind.
1007  * Even a small pool size such as 10 gives about a 10% improvement in the
1008  * forwarding rate in a bridge or router.
1009  */
1010 struct mbuf *
1011 m_getcl(int how, short type, int flags)
1012 {
1013 	return m_getcl_cache(how, type, flags,
1014 	    mbufcluster_cache, mbufphdrcluster_cache,
1015 	    &mbstat[mycpu->gd_cpuid].m_clusters);
1016 }
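
/*
 * Illustrative sketch of the common receive-buffer pattern built on
 * m_getcl(); the DMA hand-off step is hypothetical and driver-specific:
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = MCLBYTES;
 *	(hand m->m_data, MCLBYTES bytes, to the device for DMA)
 */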
1017 
1018 /*
1019  * Allocate chain of requested length.
1020  */
1021 struct mbuf *
1022 m_getc(int len, int how, int type)
1023 {
1024 	struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
1025 	int nsize;
1026 
1027 	while (len > 0) {
1028 		n = m_getl(len, how, type, 0, &nsize);
1029 		if (n == NULL)
1030 			goto failed;
1031 		n->m_len = 0;
1032 		*ntail = n;
1033 		ntail = &n->m_next;
1034 		len -= nsize;
1035 	}
1036 	return (nfirst);
1037 
1038 failed:
1039 	m_freem(nfirst);
1040 	return (NULL);
1041 }
1042 
1043 /*
1044  * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
1045  * and return a pointer to the head of the allocated chain. If m0 is
1046  * non-null, then we assume that it is a single mbuf or an mbuf chain to
1047  * which we want len bytes worth of mbufs and/or clusters attached, and so
1048  * if we succeed in allocating it, we will just return a pointer to m0.
1049  *
1050  * If we happen to fail at any point during the allocation, we will free
1051  * up everything we have already allocated and return NULL.
1052  *
1053  * Deprecated.  Use m_getc() and m_cat() instead.
1054  */
1055 struct mbuf *
1056 m_getm(struct mbuf *m0, int len, int type, int how)
1057 {
1058 	struct mbuf *nfirst;
1059 
1060 	nfirst = m_getc(len, how, type);
1061 
1062 	if (m0 != NULL) {
1063 		m_last(m0)->m_next = nfirst;
1064 		return (m0);
1065 	}
1066 
1067 	return (nfirst);
1068 }
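
/*
 * Illustrative sketch of the preferred interfaces named above: allocate a
 * fresh chain with m_getc(), fill it with data, then append it with m_cat()
 * (the fill step is application-specific and only sketched here):
 *
 *	n = m_getc(len, how, type);
 *	if (n == NULL)
 *		return (NULL);
 *	(copy len bytes of payload into n, setting m_len on each mbuf)
 *	m_cat(m0, n);
 */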
1069 
1070 /*
1071  * Adds a cluster to a normal mbuf, M_EXT is set on success.
1072  * Deprecated.  Use m_getcl() instead.
1073  */
1074 void
1075 m_mclget(struct mbuf *m, int how)
1076 {
1077 	struct mbcluster *mcl;
1078 
1079 	KKASSERT((m->m_flags & M_EXT) == 0);
1080 	mcl = objcache_get(mclmeta_cache, MBTOM(how));
1081 	if (mcl != NULL) {
1082 		linkcluster(m, mcl);
1083 		++mbstat[mycpu->gd_cpuid].m_clusters;
1084 	} else {
1085 		++mbstat[mycpu->gd_cpuid].m_drops;
1086 	}
1087 }
1088 
1089 /*
1090  * Updates to mbcluster must be MPSAFE.  Only an entity which already has
1091  * a reference to the cluster can ref it, so we are in no danger of
1092  * racing an add with a subtract.  But the operation must still be atomic
1093  * since multiple entities may have a reference on the cluster.
1094  *
1095  * m_mclfree() is almost the same but it must contend with two entities
1096  * freeing the cluster at the same time.
1097  */
1098 static void
1099 m_mclref(void *arg)
1100 {
1101 	struct mbcluster *mcl = arg;
1102 
1103 	atomic_add_int(&mcl->mcl_refs, 1);
1104 }
1105 
1106 /*
1107  * When dereferencing a cluster we have to deal with a N->0 race, where
1108  * N entities free their references simultaneously.  To do this we use
1109  * atomic_fetchadd_int().
1110  */
1111 static void
1112 m_mclfree(void *arg)
1113 {
1114 	struct mbcluster *mcl = arg;
1115 
1116 	if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) {
1117 		--mbstat[mycpu->gd_cpuid].m_clusters;
1118 		objcache_put(mclmeta_cache, mcl);
1119 	}
1120 }
1121 
1122 static void
1123 m_mjclfree(void *arg)
1124 {
1125 	struct mbcluster *mcl = arg;
1126 
1127 	if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) {
1128 		--mbstat[mycpu->gd_cpuid].m_jclusters;
1129 		objcache_put(mjclmeta_cache, mcl);
1130 	}
1131 }
1132 
1133 /*
1134  * Free a single mbuf and any associated external storage.  The successor,
1135  * if any, is returned.
1136  *
1137  * We do need to check non-first mbufs for m_aux, since some existing
1138  * code does not call M_PREPEND properly.
1139  * (example: call to bpf_mtap from drivers)
1140  */
1141 
1142 #ifdef MBUF_DEBUG
1143 
1144 struct mbuf  *
1145 _m_free(struct mbuf *m, const char *func)
1146 
1147 #else
1148 
1149 struct mbuf *
1150 m_free(struct mbuf *m)
1151 
1152 #endif
1153 {
1154 	struct mbuf *n;
1155 	struct globaldata *gd = mycpu;
1156 
1157 	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
1158 	KASSERT(M_TRAILINGSPACE(m) >= 0, ("overflowed mbuf %p", m));
1159 	--mbtypes[gd->gd_cpuid].stats[m->m_type];
1160 
1161 	n = m->m_next;
1162 
1163 	/*
1164 	 * Make sure the mbuf is in constructed state before returning it
1165 	 * to the objcache.
1166 	 */
1167 	m->m_next = NULL;
1168 	mbufuntrack(m);
1169 #ifdef MBUF_DEBUG
1170 	m->m_hdr.mh_lastfunc = func;
1171 #endif
1172 #ifdef notyet
1173 	KKASSERT(m->m_nextpkt == NULL);
1174 #else
1175 	if (m->m_nextpkt != NULL) {
1176 		static int afewtimes = 10;
1177 
1178 		if (afewtimes-- > 0) {
1179 			kprintf("mfree: m->m_nextpkt != NULL\n");
1180 			print_backtrace(-1);
1181 		}
1182 		m->m_nextpkt = NULL;
1183 	}
1184 #endif
1185 	if (m->m_flags & M_PKTHDR) {
1186 		m_tag_delete_chain(m);		/* eliminate XXX JH */
1187 	}
1188 
1189 	m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
1190 
1191 	/*
1192 	 * Clean the M_PKTHDR state so we can return the mbuf to its original
1193 	 * cache.  This is based on the PHCACHE flag which tells us whether
1194 	 * the mbuf was originally allocated out of a packet-header cache
1195 	 * or a non-packet-header cache.
1196 	 */
1197 	if (m->m_flags & M_PHCACHE) {
1198 		m->m_flags |= M_PKTHDR;
1199 		m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
1200 		m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
1201 		m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
1202 		SLIST_INIT(&m->m_pkthdr.tags);
1203 	}
1204 
1205 	/*
1206 	 * Handle remaining flags combinations.  M_CLCACHE tells us whether
1207 	 * the mbuf was originally allocated from a cluster cache or not,
1208 	 * and is totally separate from whether the mbuf is currently
1209 	 * associated with a cluster.
1210 	 */
1211 	switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
1212 	case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
1213 		/*
1214 		 * mbuf+cluster cache case.  The mbuf was allocated from the
1215 		 * combined mbuf_cluster cache and can be returned to the
1216 		 * cache if the cluster hasn't been shared.
1217 		 */
1218 		if (m_sharecount(m) == 1) {
1219 			/*
1220 			 * The cluster has not been shared, we can just
1221 			 * reset the data pointer and return the mbuf
1222 			 * to the cluster cache.  Note that the reference
1223 			 * count is left intact (it is still associated with
1224 			 * an mbuf).
1225 			 */
1226 			m->m_data = m->m_ext.ext_buf;
1227 			if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES) {
1228 				if (m->m_flags & M_PHCACHE)
1229 					objcache_put(mbufphdrjcluster_cache, m);
1230 				else
1231 					objcache_put(mbufjcluster_cache, m);
1232 				--mbstat[mycpu->gd_cpuid].m_jclusters;
1233 			} else {
1234 				if (m->m_flags & M_PHCACHE)
1235 					objcache_put(mbufphdrcluster_cache, m);
1236 				else
1237 					objcache_put(mbufcluster_cache, m);
1238 				--mbstat[mycpu->gd_cpuid].m_clusters;
1239 			}
1240 		} else {
1241 			/*
1242 			 * Hell.  Someone else has a ref on this cluster,
1243 			 * so we have to disconnect it, which means we can't
1244 			 * put it back into the mbufcluster_cache and must
1245 			 * destroy the mbuf instead.
1246 			 *
1247 			 * Other mbuf references to the cluster will typically
1248 			 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
1249 			 *
1250 			 * XXX we could try to connect another cluster to
1251 			 * it.
1252 			 */
1253 			m->m_ext.ext_free(m->m_ext.ext_arg);
1254 			m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1255 			if (m->m_ext.ext_size == MCLBYTES) {
1256 				if (m->m_flags & M_PHCACHE)
1257 					objcache_dtor(mbufphdrcluster_cache, m);
1258 				else
1259 					objcache_dtor(mbufcluster_cache, m);
1260 			} else {
1261 				if (m->m_flags & M_PHCACHE)
1262 					objcache_dtor(mbufphdrjcluster_cache, m);
1263 				else
1264 					objcache_dtor(mbufjcluster_cache, m);
1265 			}
1266 		}
1267 		break;
1268 	case M_EXT | M_EXT_CLUSTER:
1269 	case M_EXT:
1270 		/*
1271 		 * Normal cluster association case, disconnect the cluster from
1272 		 * the mbuf.  The cluster may or may not be custom.
1273 		 */
1274 		m->m_ext.ext_free(m->m_ext.ext_arg);
1275 		m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1276 		/* fall through */
1277 	case 0:
1278 		/*
1279 		 * return the mbuf to the mbuf cache.
1280 		 */
1281 		if (m->m_flags & M_PHCACHE) {
1282 			m->m_data = m->m_pktdat;
1283 			objcache_put(mbufphdr_cache, m);
1284 		} else {
1285 			m->m_data = m->m_dat;
1286 			objcache_put(mbuf_cache, m);
1287 		}
1288 		--mbstat[mycpu->gd_cpuid].m_mbufs;
1289 		break;
1290 	default:
1291 		if (!panicstr)
1292 			panic("bad mbuf flags %p %08x", m, m->m_flags);
1293 		break;
1294 	}
1295 	return (n);
1296 }
1297 
1298 #ifdef MBUF_DEBUG
1299 
1300 void
1301 _m_freem(struct mbuf *m, const char *func)
1302 {
1303 	while (m)
1304 		m = _m_free(m, func);
1305 }
1306 
1307 #else
1308 
1309 void
1310 m_freem(struct mbuf *m)
1311 {
1312 	while (m)
1313 		m = m_free(m);
1314 }
1315 
1316 #endif
1317 
1318 void
1319 m_extadd(struct mbuf *m, caddr_t buf, u_int size, void (*reff)(void *),
1320     void (*freef)(void *), void *arg)
1321 {
1322 	m->m_ext.ext_arg = arg;
1323 	m->m_ext.ext_buf = buf;
1324 	m->m_ext.ext_ref = reff;
1325 	m->m_ext.ext_free = freef;
1326 	m->m_ext.ext_size = size;
1327 	reff(arg);
1328 	m->m_data = buf;
1329 	m->m_flags |= M_EXT;
1330 }
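
/*
 * Illustrative sketch of attaching driver-owned external storage; the
 * buffer, its size and the foo_ref()/foo_free() callbacks are hypothetical:
 *
 *	m_extadd(m, (caddr_t)buf, bufsize, foo_ref, foo_free, softc);
 *
 * The callbacks receive the opaque argument (here "softc") and must manage
 * the buffer's reference count themselves.
 */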
1331 
1332 /*
1333  * mbuf utility routines
1334  */
1335 
1336 /*
1337  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
1338  * copy junk along.
1339  */
1340 struct mbuf *
1341 m_prepend(struct mbuf *m, int len, int how)
1342 {
1343 	struct mbuf *mn;
1344 
1345 	if (m->m_flags & M_PKTHDR)
1346 	    mn = m_gethdr(how, m->m_type);
1347 	else
1348 	    mn = m_get(how, m->m_type);
1349 	if (mn == NULL) {
1350 		m_freem(m);
1351 		return (NULL);
1352 	}
1353 	if (m->m_flags & M_PKTHDR)
1354 		M_MOVE_PKTHDR(mn, m);
1355 	mn->m_next = m;
1356 	m = mn;
1357 	if (len < MHLEN)
1358 		MH_ALIGN(m, len);
1359 	m->m_len = len;
1360 	return (m);
1361 }
1362 
1363 /*
1364  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1365  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
1366  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from the caller.
1367  * Note that the copy is read-only, because clusters are not copied,
1368  * only their reference counts are incremented.
1369  */
1370 struct mbuf *
1371 m_copym(const struct mbuf *m, int off0, int len, int wait)
1372 {
1373 	struct mbuf *n, **np;
1374 	int off = off0;
1375 	struct mbuf *top;
1376 	int copyhdr = 0;
1377 
1378 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
1379 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
1380 	if (off == 0 && (m->m_flags & M_PKTHDR))
1381 		copyhdr = 1;
1382 	while (off > 0) {
1383 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1384 		if (off < m->m_len)
1385 			break;
1386 		off -= m->m_len;
1387 		m = m->m_next;
1388 	}
1389 	np = &top;
1390 	top = NULL;
1391 	while (len > 0) {
1392 		if (m == NULL) {
1393 			KASSERT(len == M_COPYALL,
1394 			    ("m_copym, length > size of mbuf chain"));
1395 			break;
1396 		}
1397 		/*
1398 		 * Because we are sharing any cluster attachment below,
1399 		 * be sure to get an mbuf that does not have a cluster
1400 		 * associated with it.
1401 		 */
1402 		if (copyhdr)
1403 			n = m_gethdr(wait, m->m_type);
1404 		else
1405 			n = m_get(wait, m->m_type);
1406 		*np = n;
1407 		if (n == NULL)
1408 			goto nospace;
1409 		if (copyhdr) {
1410 			if (!m_dup_pkthdr(n, m, wait))
1411 				goto nospace;
1412 			if (len == M_COPYALL)
1413 				n->m_pkthdr.len -= off0;
1414 			else
1415 				n->m_pkthdr.len = len;
1416 			copyhdr = 0;
1417 		}
1418 		n->m_len = min(len, m->m_len - off);
1419 		if (m->m_flags & M_EXT) {
1420 			KKASSERT((n->m_flags & M_EXT) == 0);
1421 			n->m_data = m->m_data + off;
1422 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1423 			n->m_ext = m->m_ext;
1424 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1425 		} else {
1426 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1427 			    (unsigned)n->m_len);
1428 		}
1429 		if (len != M_COPYALL)
1430 			len -= n->m_len;
1431 		off = 0;
1432 		m = m->m_next;
1433 		np = &n->m_next;
1434 	}
1435 	if (top == NULL)
1436 		++mbstat[mycpu->gd_cpuid].m_mcfail;
1437 	return (top);
1438 nospace:
1439 	m_freem(top);
1440 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1441 	return (NULL);
1442 }
1443 
1444 /*
1445  * Copy an entire packet, including header (which must be present).
1446  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1447  * Note that the copy is read-only, because clusters are not copied,
1448  * only their reference counts are incremented.
1449  * Preserve alignment of the first mbuf so if the creator has left
1450  * some room at the beginning (e.g. for inserting protocol headers)
1451  * the copies also have the room available.
1452  */
1453 struct mbuf *
1454 m_copypacket(struct mbuf *m, int how)
1455 {
1456 	struct mbuf *top, *n, *o;
1457 
1458 	n = m_gethdr(how, m->m_type);
1459 	top = n;
1460 	if (!n)
1461 		goto nospace;
1462 
1463 	if (!m_dup_pkthdr(n, m, how))
1464 		goto nospace;
1465 	n->m_len = m->m_len;
1466 	if (m->m_flags & M_EXT) {
1467 		KKASSERT((n->m_flags & M_EXT) == 0);
1468 		n->m_data = m->m_data;
1469 		m->m_ext.ext_ref(m->m_ext.ext_arg);
1470 		n->m_ext = m->m_ext;
1471 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1472 	} else {
1473 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
1474 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1475 	}
1476 
1477 	m = m->m_next;
1478 	while (m) {
1479 		o = m_get(how, m->m_type);
1480 		if (!o)
1481 			goto nospace;
1482 
1483 		n->m_next = o;
1484 		n = n->m_next;
1485 
1486 		n->m_len = m->m_len;
1487 		if (m->m_flags & M_EXT) {
1488 			KKASSERT((n->m_flags & M_EXT) == 0);
1489 			n->m_data = m->m_data;
1490 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1491 			n->m_ext = m->m_ext;
1492 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1493 		} else {
1494 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1495 		}
1496 
1497 		m = m->m_next;
1498 	}
1499 	return top;
1500 nospace:
1501 	m_freem(top);
1502 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1503 	return (NULL);
1504 }
1505 
1506 /*
1507  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1508  * continuing for "len" bytes, into the indicated buffer.
1509  */
1510 void
1511 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1512 {
1513 	unsigned count;
1514 
1515 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1516 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1517 	while (off > 0) {
1518 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1519 		if (off < m->m_len)
1520 			break;
1521 		off -= m->m_len;
1522 		m = m->m_next;
1523 	}
1524 	while (len > 0) {
1525 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1526 		count = min(m->m_len - off, len);
1527 		bcopy(mtod(m, caddr_t) + off, cp, count);
1528 		len -= count;
1529 		cp += count;
1530 		off = 0;
1531 		m = m->m_next;
1532 	}
1533 }
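
/*
 * Illustrative use: pulling a protocol header out of a possibly fragmented
 * chain into private storage ("struct foohdr" is hypothetical and the chain
 * is assumed to hold at least that many bytes):
 *
 *	struct foohdr fh;
 *
 *	m_copydata(m, 0, sizeof(fh), (caddr_t)&fh);
 */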
1534 
1535 /*
1536  * Copy a packet header mbuf chain into a completely new chain, including
1537  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1538  * you need a writable copy of an mbuf chain.
1539  */
1540 struct mbuf *
1541 m_dup(struct mbuf *m, int how)
1542 {
1543 	struct mbuf **p, *top = NULL;
1544 	int remain, moff, nsize;
1545 
1546 	/* Sanity check */
1547 	if (m == NULL)
1548 		return (NULL);
1549 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1550 
1551 	/* While there's more data, get a new mbuf, tack it on, and fill it */
1552 	remain = m->m_pkthdr.len;
1553 	moff = 0;
1554 	p = &top;
1555 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
1556 		struct mbuf *n;
1557 
1558 		/* Get the next new mbuf */
1559 		n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1560 			   &nsize);
1561 		if (n == NULL)
1562 			goto nospace;
1563 		if (top == NULL)
1564 			if (!m_dup_pkthdr(n, m, how))
1565 				goto nospace0;
1566 
1567 		/* Link it into the new chain */
1568 		*p = n;
1569 		p = &n->m_next;
1570 
1571 		/* Copy data from original mbuf(s) into new mbuf */
1572 		n->m_len = 0;
1573 		while (n->m_len < nsize && m != NULL) {
1574 			int chunk = min(nsize - n->m_len, m->m_len - moff);
1575 
1576 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1577 			moff += chunk;
1578 			n->m_len += chunk;
1579 			remain -= chunk;
1580 			if (moff == m->m_len) {
1581 				m = m->m_next;
1582 				moff = 0;
1583 			}
1584 		}
1585 
1586 		/* Check correct total mbuf length */
1587 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1588 			("%s: bogus m_pkthdr.len", __func__));
1589 	}
1590 	return (top);
1591 
1592 nospace:
1593 	m_freem(top);
1594 nospace0:
1595 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1596 	return (NULL);
1597 }
1598 
1599 /*
1600  * Copy the non-packet mbuf data chain into a new set of mbufs, including
1601  * copying any mbuf clusters.  This is typically used by nfs_realign()
1602  * to realign a data chain.
1603  *
1604  * The original chain is left intact.  how should be MB_WAIT or MB_DONTWAIT
1605  * and NULL can be returned if MB_DONTWAIT is passed.
1606  *
1607  * Be careful to use cluster mbufs: a large mbuf chain converted to
1608  * non-cluster mbufs can exhaust our supply of mbufs.
1609  */
1610 struct mbuf *
1611 m_dup_data(struct mbuf *m, int how)
1612 {
1613 	struct mbuf **p, *n, *top = NULL;
1614 	int mlen, moff, chunk, gsize, nsize;
1615 
1616 	/*
1617 	 * Degenerate case
1618 	 */
1619 	if (m == NULL)
1620 		return (NULL);
1621 
1622 	/*
1623 	 * Optimize the mbuf allocation but do not get too carried away.
1624 	 */
1625 	if (m->m_next || m->m_len > MLEN) {
1626 		if ((m->m_flags & M_EXT) && m->m_ext.ext_size == MCLBYTES)
1627 			gsize = MCLBYTES;
1628 		else
1629 			gsize = MJUMPAGESIZE;
1630 	} else
1631 		gsize = MLEN;
1632 
1633 	/* Chain control */
1634 	p = &top;
1635 	n = NULL;
1636 	nsize = 0;
1637 
1638 	/*
1639 	 * Scan the mbuf chain until nothing is left, the new mbuf chain
1640 	 * will be allocated on the fly as needed.
1641 	 */
1642 	while (m) {
1643 		mlen = m->m_len;
1644 		moff = 0;
1645 
1646 		while (mlen) {
1647 			KKASSERT(m->m_type == MT_DATA);
1648 			if (n == NULL) {
1649 				n = m_getl(gsize, how, MT_DATA, 0, &nsize);
1650 				if (n == NULL)
1651 					goto nospace;
1652 				n->m_len = 0;
1653 				*p = n;
1654 				p = &n->m_next;
1655 			}
1656 			chunk = imin(mlen, nsize);
1657 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1658 			mlen -= chunk;
1659 			moff += chunk;
1660 			n->m_len += chunk;
1661 			nsize -= chunk;
1662 			if (nsize == 0)
1663 				n = NULL;
1664 		}
1665 		m = m->m_next;
1666 	}
1667 	*p = NULL;
1668 	return(top);
1669 nospace:
1670 	*p = NULL;
1671 	m_freem(top);
1672 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1673 	return (NULL);
1674 }
1675 
1676 /*
1677  * Concatenate mbuf chain n to m.
1678  * Both chains must be of the same type (e.g. MT_DATA).
1679  * Any m_pkthdr is not updated.
1680  */
1681 void
1682 m_cat(struct mbuf *m, struct mbuf *n)
1683 {
1684 	m = m_last(m);
1685 	while (n) {
1686 		if (m->m_flags & M_EXT ||
1687 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1688 			/* just join the two chains */
1689 			m->m_next = n;
1690 			return;
1691 		}
1692 		/* splat the data from one into the other */
1693 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1694 		    (u_int)n->m_len);
1695 		m->m_len += n->m_len;
1696 		n = m_free(n);
1697 	}
1698 }
1699 
1700 void
1701 m_adj(struct mbuf *mp, int req_len)
1702 {
1703 	int len = req_len;
1704 	struct mbuf *m;
1705 	int count;
1706 
1707 	if ((m = mp) == NULL)
1708 		return;
1709 	if (len >= 0) {
1710 		/*
1711 		 * Trim from head.
1712 		 */
1713 		while (m != NULL && len > 0) {
1714 			if (m->m_len <= len) {
1715 				len -= m->m_len;
1716 				m->m_len = 0;
1717 				m = m->m_next;
1718 			} else {
1719 				m->m_len -= len;
1720 				m->m_data += len;
1721 				len = 0;
1722 			}
1723 		}
1724 		m = mp;
1725 		if (mp->m_flags & M_PKTHDR)
1726 			m->m_pkthdr.len -= (req_len - len);
1727 	} else {
1728 		/*
1729 		 * Trim from tail.  Scan the mbuf chain,
1730 		 * calculating its length and finding the last mbuf.
1731 		 * If the adjustment only affects this mbuf, then just
1732 		 * adjust and return.  Otherwise, rescan and truncate
1733 		 * after the remaining size.
1734 		 */
1735 		len = -len;
1736 		count = 0;
1737 		for (;;) {
1738 			count += m->m_len;
1739 			if (m->m_next == NULL)
1740 				break;
1741 			m = m->m_next;
1742 		}
1743 		if (m->m_len >= len) {
1744 			m->m_len -= len;
1745 			if (mp->m_flags & M_PKTHDR)
1746 				mp->m_pkthdr.len -= len;
1747 			return;
1748 		}
1749 		count -= len;
1750 		if (count < 0)
1751 			count = 0;
1752 		/*
1753 		 * Correct length for chain is "count".
1754 		 * Find the mbuf with last data, adjust its length,
1755 		 * and toss data from remaining mbufs on chain.
1756 		 */
1757 		m = mp;
1758 		if (m->m_flags & M_PKTHDR)
1759 			m->m_pkthdr.len = count;
1760 		for (; m; m = m->m_next) {
1761 			if (m->m_len >= count) {
1762 				m->m_len = count;
1763 				break;
1764 			}
1765 			count -= m->m_len;
1766 		}
1767 		while (m->m_next)
1768 			(m = m->m_next)->m_len = 0;
1769 	}
1770 }
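
/*
 * Illustrative uses: m_adj(m, ETHER_HDR_LEN) trims a link-level header from
 * the front of a packet, while m_adj(m, -ETHER_CRC_LEN) trims a trailing
 * CRC; both constants are assumed to come from <net/ethernet.h>.
 */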
1771 
1772 /*
1773  * Set the m_data pointer of a newly-allocated mbuf
1774  * to place an object of the specified size at the
1775  * end of the mbuf, longword aligned.
1776  */
1777 void
1778 m_align(struct mbuf *m, int len)
1779 {
1780 	int adjust;
1781 
1782 	if (m->m_flags & M_EXT)
1783 		adjust = m->m_ext.ext_size - len;
1784 	else if (m->m_flags & M_PKTHDR)
1785 		adjust = MHLEN - len;
1786 	else
1787 		adjust = MLEN - len;
1788 	m->m_data += adjust &~ (sizeof(long)-1);
1789 }
1790 
1791 /*
1792  * Create a writable copy of the mbuf chain.  While doing this
1793  * we compact the chain with a goal of producing a chain with
1794  * at most two mbufs.  The second mbuf in this chain is likely
1795  * to be a cluster.  The primary purpose of this work is to create
1796  * a writable packet for encryption, compression, etc.  The
1797  * secondary goal is to linearize the data so the data can be
1798  * passed to crypto hardware in the most efficient manner possible.
1799  */
1800 struct mbuf *
1801 m_unshare(struct mbuf *m0, int how)
1802 {
1803 	struct mbuf *m, *mprev;
1804 	struct mbuf *n, *mfirst, *mlast;
1805 	int len, off;
1806 
1807 	mprev = NULL;
1808 	for (m = m0; m != NULL; m = mprev->m_next) {
1809 		/*
1810 		 * Regular mbufs are ignored unless there's a cluster
1811 		 * in front of it that we can use to coalesce.  We do
1812 		 * the latter mainly so later clusters can be coalesced
1813 		 * also w/o having to handle them specially (i.e. convert
1814 		 * mbuf+cluster -> cluster).  This optimization is heavily
1815 		 * influenced by the assumption that we're running over
1816 		 * Ethernet where MCLBYTES is large enough that the max
1817 		 * packet size will permit lots of coalescing into a
1818 		 * single cluster.  This in turn permits efficient
1819 		 * crypto operations, especially when using hardware.
1820 		 */
1821 		if ((m->m_flags & M_EXT) == 0) {
1822 			if (mprev && (mprev->m_flags & M_EXT) &&
1823 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
1824 				/* XXX: this ignores mbuf types */
1825 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1826 				       mtod(m, caddr_t), m->m_len);
1827 				mprev->m_len += m->m_len;
1828 				mprev->m_next = m->m_next;	/* unlink from chain */
1829 				m_free(m);			/* reclaim mbuf */
1830 			} else {
1831 				mprev = m;
1832 			}
1833 			continue;
1834 		}
1835 		/*
1836 		 * Writable mbufs are left alone (for now).
1837 		 */
1838 		if (M_WRITABLE(m)) {
1839 			mprev = m;
1840 			continue;
1841 		}
1842 
1843 		/*
1844 		 * Not writable, replace with a copy or coalesce with
1845 		 * the previous mbuf if possible (since we have to copy
1846 		 * it anyway, we try to reduce the number of mbufs and
1847 		 * clusters so that future work is easier).
1848 		 */
1849 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
1850 		/* NB: we only coalesce into a cluster or larger */
1851 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
1852 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
1853 			/* XXX: this ignores mbuf types */
1854 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1855 			       mtod(m, caddr_t), m->m_len);
1856 			mprev->m_len += m->m_len;
1857 			mprev->m_next = m->m_next;	/* unlink from chain */
1858 			m_free(m);			/* reclaim mbuf */
1859 			continue;
1860 		}
1861 
1862 		/*
1863 		 * Allocate new space to hold the copy...
1864 		 */
1865 		/* XXX why can M_PKTHDR be set past the first mbuf? */
1866 		if (mprev == NULL && (m->m_flags & M_PKTHDR)) {
1867 			/*
1868 			 * NB: if a packet header is present we must
1869 			 * allocate the mbuf separately from any cluster
1870 			 * because M_MOVE_PKTHDR will smash the data
1871 			 * pointer and drop the M_EXT marker.
1872 			 */
1873 			MGETHDR(n, how, m->m_type);
1874 			if (n == NULL) {
1875 				m_freem(m0);
1876 				return (NULL);
1877 			}
1878 			M_MOVE_PKTHDR(n, m);
1879 			MCLGET(n, how);
1880 			if ((n->m_flags & M_EXT) == 0) {
1881 				m_free(n);
1882 				m_freem(m0);
1883 				return (NULL);
1884 			}
1885 		} else {
1886 			n = m_getcl(how, m->m_type, m->m_flags);
1887 			if (n == NULL) {
1888 				m_freem(m0);
1889 				return (NULL);
1890 			}
1891 		}
1892 		/*
1893 		 * ... and copy the data.  We deal with jumbo mbufs
1894 		 * (i.e. m_len > MCLBYTES) by splitting them into
1895 		 * clusters.  We could just malloc a buffer and make
1896 		 * it external but too many device drivers don't know
1897 		 * how to break up the non-contiguous memory when
1898 		 * doing DMA.
1899 		 */
1900 		len = m->m_len;
1901 		off = 0;
1902 		mfirst = n;
1903 		mlast = NULL;
1904 		for (;;) {
1905 			int cc = min(len, MCLBYTES);
1906 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
1907 			n->m_len = cc;
1908 			if (mlast != NULL)
1909 				mlast->m_next = n;
1910 			mlast = n;
1911 
1912 			len -= cc;
1913 			if (len <= 0)
1914 				break;
1915 			off += cc;
1916 
1917 			n = m_getcl(how, m->m_type, m->m_flags);
1918 			if (n == NULL) {
1919 				m_freem(mfirst);
1920 				m_freem(m0);
1921 				return (NULL);
1922 			}
1923 		}
1924 		n->m_next = m->m_next;
1925 		if (mprev == NULL)
1926 			m0 = mfirst;		/* new head of chain */
1927 		else
1928 			mprev->m_next = mfirst;	/* replace old mbuf */
1929 		m_free(m);			/* release old mbuf */
1930 		mprev = mfirst;
1931 	}
1932 	return (m0);
1933 }
1934 
1935 /*
1936  * Rearrange an mbuf chain so that len bytes are contiguous
1937  * and in the data area of an mbuf (so that mtod will work for a structure
1938  * of size len).  Returns the resulting mbuf chain on success; frees it and
1939  * returns NULL on failure.  If there is room, it will add up to
1940  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1941  * avoid being called next time.
1942  */
1943 struct mbuf *
1944 m_pullup(struct mbuf *n, int len)
1945 {
1946 	struct mbuf *m;
1947 	int count;
1948 	int space;
1949 
1950 	/*
1951 	 * If first mbuf has no cluster, and has room for len bytes
1952 	 * without shifting current data, pullup into it,
1953 	 * otherwise allocate a new mbuf to prepend to the chain.
1954 	 */
1955 	if (!(n->m_flags & M_EXT) &&
1956 	    n->m_data + len < &n->m_dat[MLEN] &&
1957 	    n->m_next) {
1958 		if (n->m_len >= len)
1959 			return (n);
1960 		m = n;
1961 		n = n->m_next;
1962 		len -= m->m_len;
1963 	} else {
1964 		if (len > MHLEN)
1965 			goto bad;
1966 		if (n->m_flags & M_PKTHDR)
1967 			m = m_gethdr(MB_DONTWAIT, n->m_type);
1968 		else
1969 			m = m_get(MB_DONTWAIT, n->m_type);
1970 		if (m == NULL)
1971 			goto bad;
1972 		m->m_len = 0;
1973 		if (n->m_flags & M_PKTHDR)
1974 			M_MOVE_PKTHDR(m, n);
1975 	}
1976 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1977 	do {
1978 		count = min(min(max(len, max_protohdr), space), n->m_len);
1979 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1980 		  (unsigned)count);
1981 		len -= count;
1982 		m->m_len += count;
1983 		n->m_len -= count;
1984 		space -= count;
1985 		if (n->m_len)
1986 			n->m_data += count;
1987 		else
1988 			n = m_free(n);
1989 	} while (len > 0 && n);
1990 	if (len > 0) {
1991 		m_free(m);
1992 		goto bad;
1993 	}
1994 	m->m_next = n;
1995 	return (m);
1996 bad:
1997 	m_freem(n);
1998 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1999 	return (NULL);
2000 }
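
/*
 * Example (illustrative sketch, not part of the original file): the usual
 * m_pullup() idiom in a protocol input path that must dereference a header
 * through mtod().  The surrounding handler is hypothetical; note that on
 * failure m_pullup() has already freed the chain, so 'm' must not be
 * touched again.
 *
 *	struct ip *ip;
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;
 *	ip = mtod(m, struct ip *);
 */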
2001 
2002 /*
2003  * Partition an mbuf chain in two pieces, returning the tail --
2004  * all but the first len0 bytes.  In case of failure, it returns NULL and
2005  * attempts to restore the chain to its original state.
2006  *
2007  * Note that the resulting mbufs might be read-only, because the new
2008  * mbuf can end up sharing an mbuf cluster with the original mbuf if
2009  * the "breaking point" happens to lie within a cluster mbuf. Use the
2010  * M_WRITABLE() macro to check for this case.
2011  */
2012 struct mbuf *
2013 m_split(struct mbuf *m0, int len0, int wait)
2014 {
2015 	struct mbuf *m, *n;
2016 	unsigned len = len0, remain;
2017 
2018 	for (m = m0; m && len > m->m_len; m = m->m_next)
2019 		len -= m->m_len;
2020 	if (m == NULL)
2021 		return (NULL);
2022 	remain = m->m_len - len;
2023 	if (m0->m_flags & M_PKTHDR) {
2024 		n = m_gethdr(wait, m0->m_type);
2025 		if (n == NULL)
2026 			return (NULL);
2027 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
2028 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
2029 		m0->m_pkthdr.len = len0;
2030 		if (m->m_flags & M_EXT)
2031 			goto extpacket;
2032 		if (remain > MHLEN) {
2033 			/* m can't be the lead packet */
2034 			MH_ALIGN(n, 0);
2035 			n->m_next = m_split(m, len, wait);
2036 			if (n->m_next == NULL) {
2037 				m_free(n);
2038 				return (NULL);
2039 			} else {
2040 				n->m_len = 0;
2041 				return (n);
2042 			}
2043 		} else
2044 			MH_ALIGN(n, remain);
2045 	} else if (remain == 0) {
2046 		n = m->m_next;
2047 		m->m_next = NULL;
2048 		return (n);
2049 	} else {
2050 		n = m_get(wait, m->m_type);
2051 		if (n == NULL)
2052 			return (NULL);
2053 		M_ALIGN(n, remain);
2054 	}
2055 extpacket:
2056 	if (m->m_flags & M_EXT) {
2057 		KKASSERT((n->m_flags & M_EXT) == 0);
2058 		n->m_data = m->m_data + len;
2059 		m->m_ext.ext_ref(m->m_ext.ext_arg);
2060 		n->m_ext = m->m_ext;
2061 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
2062 	} else {
2063 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
2064 	}
2065 	n->m_len = remain;
2066 	m->m_len = len;
2067 	n->m_next = m->m_next;
2068 	m->m_next = NULL;
2069 	return (n);
2070 }
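
/*
 * Example (illustrative sketch, not part of the original file): splitting
 * the payload off a packet after a hypothetical 'hdrlen' bytes; the drop
 * path is hypothetical as well.  On failure m_split() returns NULL and
 * attempts to restore the original chain; on success the returned tail may
 * still share a cluster with 'm', so check M_WRITABLE() before modifying
 * it in place.
 *
 *	struct mbuf *payload;
 *
 *	payload = m_split(m, hdrlen, MB_DONTWAIT);
 *	if (payload == NULL)
 *		goto drop;
 */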
2071 
2072 /*
2073  * Routine to copy from device local memory into mbufs.
2074  * Note: "offset" is ill-defined and always called as 0, so ignore it.
2075  */
2076 struct mbuf *
2077 m_devget(char *buf, int len, int offset, struct ifnet *ifp,
2078     void (*copy)(volatile const void *from, volatile void *to, size_t length))
2079 {
2080 	struct mbuf *m, *mfirst = NULL, **mtail;
2081 	int nsize, flags;
2082 
2083 	if (copy == NULL)
2084 		copy = bcopy;
2085 	mtail = &mfirst;
2086 	flags = M_PKTHDR;
2087 
2088 	while (len > 0) {
2089 		m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
2090 		if (m == NULL) {
2091 			m_freem(mfirst);
2092 			return (NULL);
2093 		}
2094 		m->m_len = min(len, nsize);
2095 
2096 		if (flags & M_PKTHDR) {
2097 			if (len + max_linkhdr <= nsize)
2098 				m->m_data += max_linkhdr;
2099 			m->m_pkthdr.rcvif = ifp;
2100 			m->m_pkthdr.len = len;
2101 			flags = 0;
2102 		}
2103 
2104 		copy(buf, m->m_data, (unsigned)m->m_len);
2105 		buf += m->m_len;
2106 		len -= m->m_len;
2107 		*mtail = m;
2108 		mtail = &m->m_next;
2109 	}
2110 
2111 	return (mfirst);
2112 }
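
/*
 * Example (illustrative sketch, not part of the original file): a receive
 * path copying a frame out of device-local memory.  'sc', 'rx_buf' and
 * 'frame_len' are hypothetical driver state; passing NULL for the copy
 * function falls back to bcopy() as above.
 *
 *	m = m_devget(sc->rx_buf, frame_len, 0, &sc->arpcom.ac_if, NULL);
 *	if (m == NULL)
 *		return;
 *
 * If allocation fails the frame is simply dropped; otherwise 'm' is a
 * complete packet (M_PKTHDR set, rcvif and pkthdr.len filled in) ready to
 * be handed to the stack's input path.
 */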
2113 
2114 /*
2115  * Routine to pad mbuf to the specified length 'padto'.
2116  */
2117 int
2118 m_devpad(struct mbuf *m, int padto)
2119 {
2120 	struct mbuf *last = NULL;
2121 	int padlen;
2122 
2123 	if (padto <= m->m_pkthdr.len)
2124 		return 0;
2125 
2126 	padlen = padto - m->m_pkthdr.len;
2127 
2128 	/* If there's only the packet header and we can pad there, use it. */
2129 	if (m->m_pkthdr.len == m->m_len && M_TRAILINGSPACE(m) >= padlen) {
2130 		last = m;
2131 	} else {
2132 		/*
2133 		 * Walk packet chain to find last mbuf. We will either
2134 		 * pad there, or append a new mbuf and pad it.
2135 		 */
2136 		for (last = m; last->m_next != NULL; last = last->m_next)
2137 			; /* EMPTY */
2138 
2139 		/* `last' now points to last in chain. */
2140 		if (M_TRAILINGSPACE(last) < padlen) {
2141 			struct mbuf *n;
2142 
2143 			/* Allocate new empty mbuf, pad it.  Compact later. */
2144 			MGET(n, MB_DONTWAIT, MT_DATA);
2145 			if (n == NULL)
2146 				return ENOBUFS;
2147 			n->m_len = 0;
2148 			last->m_next = n;
2149 			last = n;
2150 		}
2151 	}
2152 	KKASSERT(M_TRAILINGSPACE(last) >= padlen);
2153 	KKASSERT(M_WRITABLE(last));
2154 
2155 	/* Now zero the pad area */
2156 	bzero(mtod(last, char *) + last->m_len, padlen);
2157 	last->m_len += padlen;
2158 	m->m_pkthdr.len += padlen;
2159 	return 0;
2160 }
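
/*
 * Example (illustrative sketch, not part of the original file): padding a
 * runt Ethernet frame up to the minimum wire size before handing it to
 * hardware that does not pad automatically.  m_devpad() does not free the
 * chain on failure, so the caller disposes of it.
 *
 *	if (m_devpad(m, ETHER_MIN_LEN - ETHER_CRC_LEN) != 0) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */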
2161 
2162 /*
2163  * Copy data from a buffer back into the indicated mbuf chain,
2164  * starting "off" bytes from the beginning, extending the mbuf
2165  * chain if necessary.
2166  */
2167 void
2168 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
2169 {
2170 	int mlen;
2171 	struct mbuf *m = m0, *n;
2172 	int totlen = 0;
2173 
2174 	if (m0 == NULL)
2175 		return;
2176 	while (off > (mlen = m->m_len)) {
2177 		off -= mlen;
2178 		totlen += mlen;
2179 		if (m->m_next == NULL) {
2180 			n = m_getclr(MB_DONTWAIT, m->m_type);
2181 			if (n == NULL)
2182 				goto out;
2183 			n->m_len = min(MLEN, len + off);
2184 			m->m_next = n;
2185 		}
2186 		m = m->m_next;
2187 	}
2188 	while (len > 0) {
2189 		mlen = min (m->m_len - off, len);
2190 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
2191 		cp += mlen;
2192 		len -= mlen;
2193 		mlen += off;
2194 		off = 0;
2195 		totlen += mlen;
2196 		if (len == 0)
2197 			break;
2198 		if (m->m_next == NULL) {
2199 			n = m_get(MB_DONTWAIT, m->m_type);
2200 			if (n == NULL)
2201 				break;
2202 			n->m_len = min(MLEN, len);
2203 			m->m_next = n;
2204 		}
2205 		m = m->m_next;
2206 	}
2207 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
2208 		m->m_pkthdr.len = totlen;
2209 }
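
/*
 * Example (illustrative sketch, not part of the original file): rewriting
 * a 16-bit field at a known offset within a packet, e.g. zeroing a
 * checksum before recomputing it.  'csum_off' is hypothetical, and the
 * caller is assumed to own a writable chain since m_copyback() does not
 * check M_WRITABLE().
 *
 *	uint16_t zero = 0;
 *
 *	m_copyback(m, csum_off, sizeof(zero), (caddr_t)&zero);
 */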
2210 
2211 /*
2212  * Append the specified data to the indicated mbuf chain.
2213  * Extend the mbuf chain if the new data does not fit in
2214  * existing space.
2215  *
2216  * Return 1 if able to complete the job; otherwise 0.
2217  */
2218 int
2219 m_append(struct mbuf *m0, int len, c_caddr_t cp)
2220 {
2221 	struct mbuf *m, *n;
2222 	int remainder, space;
2223 
2224 	for (m = m0; m->m_next != NULL; m = m->m_next)
2225 		;
2226 	remainder = len;
2227 	space = M_TRAILINGSPACE(m);
2228 	if (space > 0) {
2229 		/*
2230 		 * Copy into available space.
2231 		 */
2232 		if (space > remainder)
2233 			space = remainder;
2234 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2235 		m->m_len += space;
2236 		cp += space, remainder -= space;
2237 	}
2238 	while (remainder > 0) {
2239 		/*
2240 		 * Allocate a new mbuf; could check space
2241 		 * and allocate a cluster instead.
2242 		 */
2243 		n = m_get(MB_DONTWAIT, m->m_type);
2244 		if (n == NULL)
2245 			break;
2246 		n->m_len = min(MLEN, remainder);
2247 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2248 		cp += n->m_len, remainder -= n->m_len;
2249 		m->m_next = n;
2250 		m = n;
2251 	}
2252 	if (m0->m_flags & M_PKTHDR)
2253 		m0->m_pkthdr.len += len - remainder;
2254 	return (remainder == 0);
2255 }
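
/*
 * Example (illustrative sketch, not part of the original file): appending
 * a small trailer to a packet.  'trailer' is hypothetical; m_append()
 * returns 0 when it runs out of mbufs, leaving the chain only partially
 * extended, so the caller typically drops the packet in that case.
 *
 *	if (!m_append(m, sizeof(trailer), (c_caddr_t)&trailer)) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */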
2256 
2257 /*
2258  * Apply function f to the data in an mbuf chain starting "off" bytes from
2259  * the beginning, continuing for "len" bytes.
2260  */
2261 int
2262 m_apply(struct mbuf *m, int off, int len,
2263     int (*f)(void *, void *, u_int), void *arg)
2264 {
2265 	u_int count;
2266 	int rval;
2267 
2268 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
2269 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
2270 	while (off > 0) {
2271 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
2272 		if (off < m->m_len)
2273 			break;
2274 		off -= m->m_len;
2275 		m = m->m_next;
2276 	}
2277 	while (len > 0) {
2278 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
2279 		count = min(m->m_len - off, len);
2280 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
2281 		if (rval)
2282 			return (rval);
2283 		len -= count;
2284 		off = 0;
2285 		m = m->m_next;
2286 	}
2287 	return (0);
2288 }
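
/*
 * Example (illustrative sketch, not part of the original file): feeding a
 * region of a chain through a callback without linearizing it, e.g. for a
 * running digest.  'digest_update', 'ctx', 'skip' and 'payload_len' are
 * hypothetical; the callback must match the int (*)(void *, void *, u_int)
 * signature and return non-zero to abort the walk.
 *
 *	error = m_apply(m, skip, payload_len, digest_update, ctx);
 *	if (error)
 *		goto fail;
 */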
2289 
2290 /*
2291  * Locate byte 'loc' in an mbuf chain: return its mbuf and set *off within it.
2292  */
2293 struct mbuf *
2294 m_getptr(struct mbuf *m, int loc, int *off)
2295 {
2296 
2297 	while (loc >= 0) {
2298 		/* Normal end of search. */
2299 		if (m->m_len > loc) {
2300 			*off = loc;
2301 			return (m);
2302 		} else {
2303 			loc -= m->m_len;
2304 			if (m->m_next == NULL) {
2305 				if (loc == 0) {
2306 					/* Point at the end of valid data. */
2307 					*off = m->m_len;
2308 					return (m);
2309 				}
2310 				return (NULL);
2311 			}
2312 			m = m->m_next;
2313 		}
2314 	}
2315 	return (NULL);
2316 }
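
/*
 * Example (illustrative sketch, not part of the original file): locating
 * the mbuf and intra-mbuf offset that hold byte 'loc' of a packet before
 * examining it in place, assuming 'loc' lies strictly inside the data.
 *
 *	struct mbuf *n;
 *	u_char byte;
 *	int off;
 *
 *	n = m_getptr(m, loc, &off);
 *	if (n == NULL)
 *		return (EINVAL);
 *	byte = *(mtod(n, u_char *) + off);
 */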
2317 
2318 void
2319 m_print(const struct mbuf *m)
2320 {
2321 	int len;
2322 	const struct mbuf *m2;
2323 	char *hexstr;
2324 
2325 	len = m->m_pkthdr.len;
2326 	m2 = m;
2327 	hexstr = kmalloc(HEX_NCPYLEN(len), M_TEMP, M_ZERO | M_WAITOK);
2328 	while (len) {
2329 		kprintf("%p %s\n", m2, hexncpy(m2->m_data, m2->m_len, hexstr,
2330 			HEX_NCPYLEN(m2->m_len), "-"));
2331 		len -= m2->m_len;
2332 		m2 = m2->m_next;
2333 	}
2334 	kfree(hexstr, M_TEMP);
2335 	return;
2336 }
2337 
2338 /*
2339  * "Move" mbuf pkthdr from "from" to "to".
2340  * "from" must have M_PKTHDR set, and "to" must be empty.
2341  */
2342 void
2343 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
2344 {
2345 	KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
2346 
2347 	to->m_flags |= from->m_flags & M_COPYFLAGS;
2348 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
2349 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
2350 }
2351 
2352 /*
2353  * Duplicate "from"'s mbuf pkthdr in "to".
2354  * "from" must have M_PKTHDR set, and "to" must be empty.
2355  * In particular, this does a deep copy of the packet tags.
2356  */
2357 int
2358 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
2359 {
2360 	KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
2361 
2362 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
2363 		      (to->m_flags & ~M_COPYFLAGS);
2364 	to->m_pkthdr = from->m_pkthdr;
2365 	SLIST_INIT(&to->m_pkthdr.tags);
2366 	return (m_tag_copy_chain(to, from, how));
2367 }
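
/*
 * Example (illustrative sketch, not part of the original file): giving a
 * freshly allocated header mbuf a deep copy of an existing packet's
 * header, including its m_tag chain.  m_dup_pkthdr() returns 0 if the tag
 * copy fails.
 *
 *	n = m_gethdr(MB_DONTWAIT, MT_DATA);
 *	if (n == NULL || !m_dup_pkthdr(n, m, MB_DONTWAIT)) {
 *		if (n != NULL)
 *			m_free(n);
 *		return (NULL);
 *	}
 */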
2368 
2369 /*
2370  * Defragment an mbuf chain, returning the shortest possible
2371  * chain of mbufs and clusters.  If allocation fails and
2372  * this cannot be completed, NULL will be returned, but
2373  * the passed-in chain will be unchanged.  Upon success,
2374  * the original chain will be freed and the new chain
2375  * will be returned.
2376  *
2377  * If an mbuf without a packet header is passed in, the
2378  * original mbuf chain is returned unharmed.
2379  *
2380  * m_defrag_nofree() does not free the passed-in mbuf chain.
2381  */
2382 struct mbuf *
2383 m_defrag(struct mbuf *m0, int how)
2384 {
2385 	struct mbuf *m_new;
2386 
2387 	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
2388 		return (NULL);
2389 	if (m_new != m0)
2390 		m_freem(m0);
2391 	return (m_new);
2392 }
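
/*
 * Example (illustrative sketch, not part of the original file): a transmit
 * path collapsing a long chain before handing it to hardware with a small
 * scatter/gather limit.  On success the old chain has already been freed
 * by m_defrag(); on failure it is untouched and the caller decides whether
 * to drop it.
 *
 *	struct mbuf *m_new;
 *
 *	m_new = m_defrag(m, MB_DONTWAIT);
 *	if (m_new == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = m_new;
 */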
2393 
2394 struct mbuf *
2395 m_defrag_nofree(struct mbuf *m0, int how)
2396 {
2397 	struct mbuf	*m_new = NULL, *m_final = NULL;
2398 	int		progress = 0, length, nsize;
2399 
2400 	if (!(m0->m_flags & M_PKTHDR))
2401 		return (m0);
2402 
2403 #ifdef MBUF_STRESS_TEST
2404 	if (m_defragrandomfailures) {
2405 		int temp = karc4random() & 0xff;
2406 		if (temp == 0xba)
2407 			goto nospace;
2408 	}
2409 #endif
2410 
2411 	m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
2412 	if (m_final == NULL)
2413 		goto nospace;
2414 	m_final->m_len = 0;	/* in case m0->m_pkthdr.len is zero */
2415 
2416 	if (m_dup_pkthdr(m_final, m0, how) == 0)
2417 		goto nospace;
2418 
2419 	m_new = m_final;
2420 
2421 	while (progress < m0->m_pkthdr.len) {
2422 		length = m0->m_pkthdr.len - progress;
2423 		if (length > MCLBYTES)
2424 			length = MCLBYTES;
2425 
2426 		if (m_new == NULL) {
2427 			m_new = m_getl(length, how, MT_DATA, 0, &nsize);
2428 			if (m_new == NULL)
2429 				goto nospace;
2430 		}
2431 
2432 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
2433 		progress += length;
2434 		m_new->m_len = length;
2435 		if (m_new != m_final)
2436 			m_cat(m_final, m_new);
2437 		m_new = NULL;
2438 	}
2439 	if (m0->m_next == NULL)
2440 		m_defraguseless++;
2441 	m_defragpackets++;
2442 	m_defragbytes += m_final->m_pkthdr.len;
2443 	return (m_final);
2444 nospace:
2445 	m_defragfailure++;
2446 	if (m_new)
2447 		m_free(m_new);
2448 	m_freem(m_final);
2449 	return (NULL);
2450 }
2451 
2452 /*
2453  * Move data from uio into mbufs.
2454  */
2455 struct mbuf *
2456 m_uiomove(struct uio *uio)
2457 {
2458 	struct mbuf *m;			/* current working mbuf */
2459 	struct mbuf *head = NULL;	/* result mbuf chain */
2460 	struct mbuf **mp = &head;
2461 	int flags = M_PKTHDR;
2462 	int nsize;
2463 	int error;
2464 	int resid;
2465 
2466 	do {
2467 		if (uio->uio_resid > INT_MAX)
2468 			resid = INT_MAX;
2469 		else
2470 			resid = (int)uio->uio_resid;
2471 		m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
2472 		if (flags) {
2473 			m->m_pkthdr.len = 0;
2474 			/* Leave room for protocol headers. */
2475 			if (resid < MHLEN)
2476 				MH_ALIGN(m, resid);
2477 			flags = 0;
2478 		}
2479 		m->m_len = imin(nsize, resid);
2480 		error = uiomove(mtod(m, caddr_t), m->m_len, uio);
2481 		if (error) {
2482 			m_free(m);
2483 			goto failed;
2484 		}
2485 		*mp = m;
2486 		mp = &m->m_next;
2487 		head->m_pkthdr.len += m->m_len;
2488 	} while (uio->uio_resid > 0);
2489 
2490 	return (head);
2491 
2492 failed:
2493 	m_freem(head);
2494 	return (NULL);
2495 }
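
/*
 * Example (illustrative sketch, not part of the original file): a write
 * path turning the remaining user I/O into a packet in one call.  The
 * uiomove() error is not propagated by m_uiomove(), so the EFAULT below is
 * the caller's own (hypothetical) choice of error code.
 *
 *	m = m_uiomove(uio);
 *	if (m == NULL)
 *		return (EFAULT);
 */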
2496 
2497 struct mbuf *
2498 m_last(struct mbuf *m)
2499 {
2500 	while (m->m_next)
2501 		m = m->m_next;
2502 	return (m);
2503 }
2504 
2505 /*
2506  * Return the number of bytes in an mbuf chain.
2507  * If lastm is not NULL, also return the last mbuf.
2508  */
2509 u_int
2510 m_lengthm(struct mbuf *m, struct mbuf **lastm)
2511 {
2512 	u_int len = 0;
2513 	struct mbuf *prev = m;
2514 
2515 	while (m) {
2516 		len += m->m_len;
2517 		prev = m;
2518 		m = m->m_next;
2519 	}
2520 	if (lastm != NULL)
2521 		*lastm = prev;
2522 	return (len);
2523 }
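
/*
 * Example (illustrative sketch, not part of the original file): verifying
 * that a packet's advertised length matches its chain and grabbing the
 * tail mbuf for a later append.
 *
 *	struct mbuf *last;
 *
 *	if (m_lengthm(m, &last) != m->m_pkthdr.len)
 *		panic("mbuf chain length mismatch");
 */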
2524 
2525 /*
2526  * Like m_lengthm(), except it also keeps track of mbuf usage.
2527  */
2528 u_int
2529 m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
2530 {
2531 	u_int len = 0, mbcnt = 0;
2532 	struct mbuf *prev = m;
2533 
2534 	while (m) {
2535 		len += m->m_len;
2536 		mbcnt += MSIZE;
2537 		if (m->m_flags & M_EXT)
2538 			mbcnt += m->m_ext.ext_size;
2539 		prev = m;
2540 		m = m->m_next;
2541 	}
2542 	if (lastm != NULL)
2543 		*lastm = prev;
2544 	*pmbcnt = mbcnt;
2545 	return (len);
2546 }
2547