xref: /dragonfly/sys/kern/uipc_mbuf.c (revision 926deccb)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
5  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
6  *
7  * This code is derived from software contributed to The DragonFly Project
8  * by Jeffrey M. Hsu.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 /*
37  * Copyright (c) 1982, 1986, 1988, 1991, 1993
38  *	The Regents of the University of California.  All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  *
64  * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
65  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
66  */
67 
68 #include "opt_param.h"
69 #include "opt_mbuf_stress_test.h"
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/file.h>
73 #include <sys/malloc.h>
74 #include <sys/mbuf.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/domain.h>
78 #include <sys/objcache.h>
79 #include <sys/tree.h>
80 #include <sys/protosw.h>
81 #include <sys/uio.h>
82 #include <sys/thread.h>
83 #include <sys/globaldata.h>
84 
85 #include <sys/thread2.h>
86 #include <sys/spinlock2.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/limits.h>
90 
91 #include <vm/vm.h>
92 #include <vm/vm_kern.h>
93 #include <vm/vm_extern.h>
94 
95 #ifdef INVARIANTS
96 #include <machine/cpu.h>
97 #endif
98 
99 /*
100  * mbuf cluster meta-data
101  */
102 struct mbcluster {
103 	int32_t	mcl_refs;
104 	void	*mcl_data;
105 };
106 
107 /*
108  * mbuf tracking for debugging purposes
109  */
110 #ifdef MBUF_DEBUG
111 
112 static MALLOC_DEFINE(M_MTRACK, "mtrack", "mtrack");
113 
114 struct mbtrack;
115 RB_HEAD(mbuf_rb_tree, mbtrack);
116 RB_PROTOTYPE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *);
117 
118 struct mbtrack {
119 	RB_ENTRY(mbtrack) rb_node;
120 	int trackid;
121 	struct mbuf *m;
122 };
123 
124 static int
125 mbtrack_cmp(struct mbtrack *mb1, struct mbtrack *mb2)
126 {
127 	if (mb1->m < mb2->m)
128 		return(-1);
129 	if (mb1->m > mb2->m)
130 		return(1);
131 	return(0);
132 }
133 
134 RB_GENERATE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *, m);
135 
136 struct mbuf_rb_tree	mbuf_track_root;
137 static struct spinlock	mbuf_track_spin = SPINLOCK_INITIALIZER(mbuf_track_spin);
138 
139 static void
140 mbuftrack(struct mbuf *m)
141 {
142 	struct mbtrack *mbt;
143 
144 	mbt = kmalloc(sizeof(*mbt), M_MTRACK, M_INTWAIT|M_ZERO);
145 	spin_lock(&mbuf_track_spin);
146 	mbt->m = m;
147 	if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt)) {
148 		spin_unlock(&mbuf_track_spin);
149 		panic("mbuftrack: mbuf %p already being tracked", m);
150 	}
151 	spin_unlock(&mbuf_track_spin);
152 }
153 
154 static void
155 mbufuntrack(struct mbuf *m)
156 {
157 	struct mbtrack *mbt;
158 
159 	spin_lock(&mbuf_track_spin);
160 	mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
161 	if (mbt == NULL) {
162 		spin_unlock(&mbuf_track_spin);
163 		panic("mbufuntrack: mbuf %p was not tracked", m);
164 	} else {
165 		mbuf_rb_tree_RB_REMOVE(&mbuf_track_root, mbt);
166 		spin_unlock(&mbuf_track_spin);
167 		kfree(mbt, M_MTRACK);
168 	}
169 }
170 
171 void
172 mbuftrackid(struct mbuf *m, int trackid)
173 {
174 	struct mbtrack *mbt;
175 	struct mbuf *n;
176 
177 	spin_lock(&mbuf_track_spin);
178 	while (m) {
179 		n = m->m_nextpkt;
180 		while (m) {
181 			mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
182 			if (mbt == NULL) {
183 				spin_unlock(&mbuf_track_spin);
184 				panic("mbuftrackid: mbuf %p not tracked", m);
185 			}
186 			mbt->trackid = trackid;
187 			m = m->m_next;
188 		}
189 		m = n;
190 	}
191 	spin_unlock(&mbuf_track_spin);
192 }
193 
194 static int
195 mbuftrack_callback(struct mbtrack *mbt, void *arg)
196 {
197 	struct sysctl_req *req = arg;
198 	char buf[64];
199 	int error;
200 
201 	ksnprintf(buf, sizeof(buf), "mbuf %p track %d\n", mbt->m, mbt->trackid);
202 
203 	spin_unlock(&mbuf_track_spin);
204 	error = SYSCTL_OUT(req, buf, strlen(buf));
205 	spin_lock(&mbuf_track_spin);
206 	if (error)
207 		return(-error);
208 	return(0);
209 }
210 
211 static int
212 mbuftrack_show(SYSCTL_HANDLER_ARGS)
213 {
214 	int error;
215 
216 	spin_lock(&mbuf_track_spin);
217 	error = mbuf_rb_tree_RB_SCAN(&mbuf_track_root, NULL,
218 				     mbuftrack_callback, req);
219 	spin_unlock(&mbuf_track_spin);
220 	return (-error);
221 }
222 SYSCTL_PROC(_kern_ipc, OID_AUTO, showmbufs, CTLFLAG_RD|CTLTYPE_STRING,
223 	    0, 0, mbuftrack_show, "A", "Show all in-use mbufs");
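
/*
 * Illustrative note (not in the original source): with MBUF_DEBUG
 * compiled in, the tracked mbufs registered above can be dumped from
 * userland with the sysctl defined here, e.g.
 *
 *	sysctl kern.ipc.showmbufs
 */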
224 
225 #else
226 
227 #define mbuftrack(m)
228 #define mbufuntrack(m)
229 
230 #endif
231 
232 static void mbinit(void *);
233 SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL)
234 
235 struct mbtypes_stat {
236 	u_long	stats[MT_NTYPES];
237 } __cachealign;
238 
239 static struct mbtypes_stat	mbtypes[SMP_MAXCPU];
240 
241 static struct mbstat mbstat[SMP_MAXCPU] __cachealign;
242 int	max_linkhdr;
243 int	max_protohdr;
244 int	max_hdr;
245 int	max_datalen;
246 int	m_defragpackets;
247 int	m_defragbytes;
248 int	m_defraguseless;
249 int	m_defragfailure;
250 #ifdef MBUF_STRESS_TEST
251 int	m_defragrandomfailures;
252 #endif
253 
254 struct objcache *mbuf_cache, *mbufphdr_cache;
255 struct objcache *mclmeta_cache, *mjclmeta_cache;
256 struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;
257 struct objcache *mbufjcluster_cache, *mbufphdrjcluster_cache;
258 
259 int		nmbclusters;
260 static int	nmbjclusters;
261 int		nmbufs;
262 
263 static int	mclph_cachefrac;
264 static int	mcl_cachefrac;
265 
266 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
267 	&max_linkhdr, 0, "Max size of a link-level header");
268 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
269 	&max_protohdr, 0, "Max size of a protocol header");
270 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0,
271 	"Max size of link+protocol headers");
272 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
273 	&max_datalen, 0, "Max data payload size without headers");
274 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
275 	&mbuf_wait, 0, "Time in ticks to sleep after failed mbuf allocations");
276 static int do_mbstat(SYSCTL_HANDLER_ARGS);
277 
278 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT|CTLFLAG_RD,
279 	0, 0, do_mbstat, "S,mbstat", "mbuf usage statistics");
280 
281 static int do_mbtypes(SYSCTL_HANDLER_ARGS);
282 
283 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtypes, CTLTYPE_ULONG|CTLFLAG_RD,
284 	0, 0, do_mbtypes, "LU", "");
285 
286 static int
287 do_mbstat(SYSCTL_HANDLER_ARGS)
288 {
289 	struct mbstat mbstat_total;
290 	struct mbstat *mbstat_totalp;
291 	int i;
292 
293 	bzero(&mbstat_total, sizeof(mbstat_total));
294 	mbstat_totalp = &mbstat_total;
295 
296 	for (i = 0; i < ncpus; i++)
297 	{
298 		mbstat_total.m_mbufs += mbstat[i].m_mbufs;
299 		mbstat_total.m_clusters += mbstat[i].m_clusters;
300 		mbstat_total.m_jclusters += mbstat[i].m_jclusters;
301 		mbstat_total.m_clfree += mbstat[i].m_clfree;
302 		mbstat_total.m_drops += mbstat[i].m_drops;
303 		mbstat_total.m_wait += mbstat[i].m_wait;
304 		mbstat_total.m_drain += mbstat[i].m_drain;
305 		mbstat_total.m_mcfail += mbstat[i].m_mcfail;
306 		mbstat_total.m_mpfail += mbstat[i].m_mpfail;
307 
308 	}
309 	/*
310 	 * The following fields are not cumulative fields so just
311 	 * get their values once.
312 	 */
313 	mbstat_total.m_msize = mbstat[0].m_msize;
314 	mbstat_total.m_mclbytes = mbstat[0].m_mclbytes;
315 	mbstat_total.m_minclsize = mbstat[0].m_minclsize;
316 	mbstat_total.m_mlen = mbstat[0].m_mlen;
317 	mbstat_total.m_mhlen = mbstat[0].m_mhlen;
318 
319 	return(sysctl_handle_opaque(oidp, mbstat_totalp, sizeof(mbstat_total), req));
320 }
321 
322 static int
323 do_mbtypes(SYSCTL_HANDLER_ARGS)
324 {
325 	u_long totals[MT_NTYPES];
326 	int i, j;
327 
328 	for (i = 0; i < MT_NTYPES; i++)
329 		totals[i] = 0;
330 
331 	for (i = 0; i < ncpus; i++)
332 	{
333 		for (j = 0; j < MT_NTYPES; j++)
334 			totals[j] += mbtypes[i].stats[j];
335 	}
336 
337 	return(sysctl_handle_opaque(oidp, totals, sizeof(totals), req));
338 }
339 
340 /*
341  * These are read-only because we do not currently have any code
342  * to adjust the objcache limits after the fact.  The variables
343  * may only be set as boot-time tunables.
344  */
345 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
346 	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
347 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
348 	   "Maximum number of mbufs available");
349 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjclusters, CTLFLAG_RD, &nmbjclusters, 0,
350 	   "Maximum number of mbuf jclusters available");
351 SYSCTL_INT(_kern_ipc, OID_AUTO, mclph_cachefrac, CTLFLAG_RD,
352     	   &mclph_cachefrac, 0,
353 	   "Fraction of cacheable mbuf clusters w/ pkthdr");
354 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_cachefrac, CTLFLAG_RD,
355     	   &mcl_cachefrac, 0, "Fraction of cacheable mbuf clusters");
356 
357 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
358 	   &m_defragpackets, 0, "Number of defragment packets");
359 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
360 	   &m_defragbytes, 0, "Number of defragment bytes");
361 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
362 	   &m_defraguseless, 0, "Number of useless defragment mbuf chain operations");
363 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
364 	   &m_defragfailure, 0, "Number of failed defragment mbuf chain operations");
365 #ifdef MBUF_STRESS_TEST
366 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
367 	   &m_defragrandomfailures, 0, "");
368 #endif
369 
370 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
371 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
372 static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");
373 
374 static void m_reclaim (void);
375 static void m_mclref(void *arg);
376 static void m_mclfree(void *arg);
377 static void m_mjclfree(void *arg);
378 
379 /*
380  * NOTE: Default NMBUFS must take into account a possible DOS attack
381  *	 using fd passing on unix domain sockets.
382  */
383 #ifndef NMBCLUSTERS
384 #define NMBCLUSTERS	(512 + maxusers * 16)
385 #endif
386 #ifndef MCLPH_CACHEFRAC
387 #define MCLPH_CACHEFRAC	16
388 #endif
389 #ifndef MCL_CACHEFRAC
390 #define MCL_CACHEFRAC	4
391 #endif
392 #ifndef NMBJCLUSTERS
393 #define NMBJCLUSTERS	(NMBCLUSTERS / 2)
394 #endif
395 #ifndef NMBUFS
396 #define NMBUFS		(nmbclusters * 2 + maxfiles)
397 #endif
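
/*
 * Illustrative example (not from the original source): with maxusers = 64
 * the defaults above work out to NMBCLUSTERS = 512 + 64 * 16 = 1536,
 * NMBJCLUSTERS = 1536 / 2 = 768 and NMBUFS = 2 * 1536 + maxfiles.  All of
 * them may be overridden with the kern.ipc.* tunables fetched below.
 */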
398 
399 /*
400  * Perform sanity checks of tunables declared above.
401  */
402 static void
403 tunable_mbinit(void *dummy)
404 {
405 	/*
406 	 * This has to be done before VM init.
407 	 */
408 	nmbclusters = NMBCLUSTERS;
409 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
410 	mclph_cachefrac = MCLPH_CACHEFRAC;
411 	TUNABLE_INT_FETCH("kern.ipc.mclph_cachefrac", &mclph_cachefrac);
412 	mcl_cachefrac = MCL_CACHEFRAC;
413 	TUNABLE_INT_FETCH("kern.ipc.mcl_cachefrac", &mcl_cachefrac);
414 
415 	nmbjclusters = NMBJCLUSTERS;
416 	TUNABLE_INT_FETCH("kern.ipc.nmbjclusters", &nmbjclusters);
417 
418 	nmbufs = NMBUFS;
419 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
420 
421 	/* Sanity checks */
422 	if (nmbufs < nmbclusters * 2)
423 		nmbufs = nmbclusters * 2;
424 }
425 SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
426 	tunable_mbinit, NULL);
427 
428 /* "number of clusters of pages" */
429 #define NCL_INIT	1
430 
431 #define NMB_INIT	16
432 
433 /*
434  * The mbuf object cache only guarantees that m_next and m_nextpkt are
435  * NULL and that m_data points to the beginning of the data area.  In
436  * particular, m_len and m_pkthdr.len are uninitialized.  It is the
437  * responsibility of the caller to initialize those fields before use.
438  */
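
/*
 * Illustrative sketch (not part of the original code): anything taking
 * objects straight from these caches has to finish the initialization
 * itself, which is what m_get()/m_gethdr() below do, e.g.
 *
 *	m = objcache_get(mbuf_cache, MBTOM(how));
 *	if (m != NULL) {
 *		m->m_len = 0;
 *		m->m_type = type;
 *	}
 */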
439 
440 static __inline boolean_t
441 mbuf_ctor(void *obj, void *private, int ocflags)
442 {
443 	struct mbuf *m = obj;
444 
445 	m->m_next = NULL;
446 	m->m_nextpkt = NULL;
447 	m->m_data = m->m_dat;
448 	m->m_flags = 0;
449 
450 	return (TRUE);
451 }
452 
453 /*
454  * Initialize the mbuf and the packet header fields.
455  */
456 static boolean_t
457 mbufphdr_ctor(void *obj, void *private, int ocflags)
458 {
459 	struct mbuf *m = obj;
460 
461 	m->m_next = NULL;
462 	m->m_nextpkt = NULL;
463 	m->m_data = m->m_pktdat;
464 	m->m_flags = M_PKTHDR | M_PHCACHE;
465 
466 	m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
467 	SLIST_INIT(&m->m_pkthdr.tags);
468 	m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
469 	m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
470 
471 	return (TRUE);
472 }
473 
474 /*
475  * An mbcluster object consists of a 2K (MCLBYTES) cluster and a refcount.
476  */
477 static boolean_t
478 mclmeta_ctor(void *obj, void *private, int ocflags)
479 {
480 	struct mbcluster *cl = obj;
481 	void *buf;
482 
483 	if (ocflags & M_NOWAIT)
484 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
485 	else
486 		buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
487 	if (buf == NULL)
488 		return (FALSE);
489 	cl->mcl_refs = 0;
490 	cl->mcl_data = buf;
491 	return (TRUE);
492 }
493 
494 static boolean_t
495 mjclmeta_ctor(void *obj, void *private, int ocflags)
496 {
497 	struct mbcluster *cl = obj;
498 	void *buf;
499 
500 	if (ocflags & M_NOWAIT)
501 		buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_NOWAIT | M_ZERO);
502 	else
503 		buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_INTWAIT | M_ZERO);
504 	if (buf == NULL)
505 		return (FALSE);
506 	cl->mcl_refs = 0;
507 	cl->mcl_data = buf;
508 	return (TRUE);
509 }
510 
511 static void
512 mclmeta_dtor(void *obj, void *private)
513 {
514 	struct mbcluster *mcl = obj;
515 
516 	KKASSERT(mcl->mcl_refs == 0);
517 	kfree(mcl->mcl_data, M_MBUFCL);
518 }
519 
520 static void
521 linkjcluster(struct mbuf *m, struct mbcluster *cl, uint size)
522 {
523 	/*
524 	 * Add the cluster to the mbuf.  The caller will detect that the
525 	 * mbuf now has an attached cluster.
526 	 */
527 	m->m_ext.ext_arg = cl;
528 	m->m_ext.ext_buf = cl->mcl_data;
529 	m->m_ext.ext_ref = m_mclref;
530 	if (size != MCLBYTES)
531 		m->m_ext.ext_free = m_mjclfree;
532 	else
533 		m->m_ext.ext_free = m_mclfree;
534 	m->m_ext.ext_size = size;
535 	atomic_add_int(&cl->mcl_refs, 1);
536 
537 	m->m_data = m->m_ext.ext_buf;
538 	m->m_flags |= M_EXT | M_EXT_CLUSTER;
539 }
540 
541 static void
542 linkcluster(struct mbuf *m, struct mbcluster *cl)
543 {
544 	linkjcluster(m, cl, MCLBYTES);
545 }
546 
547 static boolean_t
548 mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
549 {
550 	struct mbuf *m = obj;
551 	struct mbcluster *cl;
552 
553 	mbufphdr_ctor(obj, private, ocflags);
554 	cl = objcache_get(mclmeta_cache, ocflags);
555 	if (cl == NULL) {
556 		++mbstat[mycpu->gd_cpuid].m_drops;
557 		return (FALSE);
558 	}
559 	m->m_flags |= M_CLCACHE;
560 	linkcluster(m, cl);
561 	return (TRUE);
562 }
563 
564 static boolean_t
565 mbufphdrjcluster_ctor(void *obj, void *private, int ocflags)
566 {
567 	struct mbuf *m = obj;
568 	struct mbcluster *cl;
569 
570 	mbufphdr_ctor(obj, private, ocflags);
571 	cl = objcache_get(mjclmeta_cache, ocflags);
572 	if (cl == NULL) {
573 		++mbstat[mycpu->gd_cpuid].m_drops;
574 		return (FALSE);
575 	}
576 	m->m_flags |= M_CLCACHE;
577 	linkjcluster(m, cl, MJUMPAGESIZE);
578 	return (TRUE);
579 }
580 
581 static boolean_t
582 mbufcluster_ctor(void *obj, void *private, int ocflags)
583 {
584 	struct mbuf *m = obj;
585 	struct mbcluster *cl;
586 
587 	mbuf_ctor(obj, private, ocflags);
588 	cl = objcache_get(mclmeta_cache, ocflags);
589 	if (cl == NULL) {
590 		++mbstat[mycpu->gd_cpuid].m_drops;
591 		return (FALSE);
592 	}
593 	m->m_flags |= M_CLCACHE;
594 	linkcluster(m, cl);
595 	return (TRUE);
596 }
597 
598 static boolean_t
599 mbufjcluster_ctor(void *obj, void *private, int ocflags)
600 {
601 	struct mbuf *m = obj;
602 	struct mbcluster *cl;
603 
604 	mbuf_ctor(obj, private, ocflags);
605 	cl = objcache_get(mjclmeta_cache, ocflags);
606 	if (cl == NULL) {
607 		++mbstat[mycpu->gd_cpuid].m_drops;
608 		return (FALSE);
609 	}
610 	m->m_flags |= M_CLCACHE;
611 	linkjcluster(m, cl, MJUMPAGESIZE);
612 	return (TRUE);
613 }
614 
615 /*
616  * Used for both the cluster and cluster PHDR caches.
617  *
618  * The mbuf may have lost its cluster due to sharing; deal
619  * with the situation by checking M_EXT.
620  */
621 static void
622 mbufcluster_dtor(void *obj, void *private)
623 {
624 	struct mbuf *m = obj;
625 	struct mbcluster *mcl;
626 
627 	if (m->m_flags & M_EXT) {
628 		KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
629 		mcl = m->m_ext.ext_arg;
630 		KKASSERT(mcl->mcl_refs == 1);
631 		mcl->mcl_refs = 0;
632 		if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES)
633 			objcache_put(mjclmeta_cache, mcl);
634 		else
635 			objcache_put(mclmeta_cache, mcl);
636 	}
637 }
638 
639 struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
640 struct objcache_malloc_args mclmeta_malloc_args =
641 	{ sizeof(struct mbcluster), M_MCLMETA };
642 
643 /* ARGSUSED*/
644 static void
645 mbinit(void *dummy)
646 {
647 	int mb_limit, cl_limit, ncl_limit, jcl_limit;
648 	int limit;
649 	int i;
650 
651 	/*
652 	 * Initialize statistics
653 	 */
654 	for (i = 0; i < ncpus; i++) {
655 		mbstat[i].m_msize = MSIZE;
656 		mbstat[i].m_mclbytes = MCLBYTES;
657 		mbstat[i].m_mjumpagesize = MJUMPAGESIZE;
658 		mbstat[i].m_minclsize = MINCLSIZE;
659 		mbstat[i].m_mlen = MLEN;
660 		mbstat[i].m_mhlen = MHLEN;
661 	}
662 
663 	/*
664 	 * Create object caches and save cluster limits, which will
665 	 * be used to adjust backing kmalloc pools' limit later.
666 	 */
667 
668 	mb_limit = cl_limit = 0;
669 
670 	limit = nmbufs;
671 	mbuf_cache = objcache_create("mbuf",
672 	    limit, 0,
673 	    mbuf_ctor, NULL, NULL,
674 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
675 	mb_limit += limit;
676 
677 	limit = nmbufs;
678 	mbufphdr_cache = objcache_create("mbuf pkt hdr",
679 	    limit, nmbufs / 4,
680 	    mbufphdr_ctor, NULL, NULL,
681 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
682 	mb_limit += limit;
683 
684 	ncl_limit = nmbclusters;
685 	mclmeta_cache = objcache_create("cluster mbuf",
686 	    ncl_limit, 0,
687 	    mclmeta_ctor, mclmeta_dtor, NULL,
688 	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
689 	cl_limit += ncl_limit;
690 
691 	jcl_limit = nmbjclusters;
692 	mjclmeta_cache = objcache_create("jcluster mbuf",
693 	    jcl_limit, 0,
694 	    mjclmeta_ctor, mclmeta_dtor, NULL,
695 	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
696 	cl_limit += jcl_limit;
697 
698 	limit = nmbclusters;
699 	mbufcluster_cache = objcache_create("mbuf + cluster",
700 	    limit, nmbclusters / mcl_cachefrac,
701 	    mbufcluster_ctor, mbufcluster_dtor, NULL,
702 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
703 	mb_limit += limit;
704 
705 	limit = nmbclusters;
706 	mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
707 	    limit, nmbclusters / mclph_cachefrac,
708 	    mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
709 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
710 	mb_limit += limit;
711 
712 	limit = nmbjclusters;
713 	mbufjcluster_cache = objcache_create("mbuf + jcluster",
714 	    limit, 0,
715 	    mbufjcluster_ctor, mbufcluster_dtor, NULL,
716 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
717 	mb_limit += limit;
718 
719 	limit = nmbjclusters;
720 	mbufphdrjcluster_cache = objcache_create("mbuf pkt hdr + jcluster",
721 	    limit, 0,
722 	    mbufphdrjcluster_ctor, mbufcluster_dtor, NULL,
723 	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
724 	mb_limit += limit;
725 
726 	/*
727 	 * Adjust backing kmalloc pools' limit
728 	 *
729 	 * NOTE: We raise the limit by another 1/8 to take the effect
730 	 * of loosememuse into account.
731 	 */
732 	cl_limit += cl_limit / 8;
733 	kmalloc_raise_limit(mclmeta_malloc_args.mtype,
734 	    mclmeta_malloc_args.objsize * (size_t)cl_limit);
735 	kmalloc_raise_limit(M_MBUFCL,
736 	    (MCLBYTES * (size_t)ncl_limit) +
737 	    (MJUMPAGESIZE * (size_t)jcl_limit));
738 
739 	mb_limit += mb_limit / 8;
740 	kmalloc_raise_limit(mbuf_malloc_args.mtype,
741 	    mbuf_malloc_args.objsize * (size_t)mb_limit);
742 }
743 
744 /*
745  * Return the number of references to this mbuf's data.  0 is returned
746  * if the mbuf is not M_EXT, a reference count is returned if it is
747  * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
748  */
749 int
750 m_sharecount(struct mbuf *m)
751 {
752 	switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
753 	case 0:
754 		return (0);
755 	case M_EXT:
756 		return (99);
757 	case M_EXT | M_EXT_CLUSTER:
758 		return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
759 	}
760 	/* NOTREACHED */
761 	return (0);		/* to shut up compiler */
762 }
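
/*
 * Usage sketch (illustrative only): code that wants to scribble on
 * cluster data in place typically checks the share count first, e.g.
 *
 *	if ((m->m_flags & M_EXT) && m_sharecount(m) > 1) {
 *		(the cluster is shared; obtain a writable copy, e.g. with
 *		 m_unshare() or m_dup(), before modifying the data)
 *	}
 */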
763 
764 /*
765  * change mbuf to new type
766  */
767 void
768 m_chtype(struct mbuf *m, int type)
769 {
770 	struct globaldata *gd = mycpu;
771 
772 	++mbtypes[gd->gd_cpuid].stats[type];
773 	--mbtypes[gd->gd_cpuid].stats[m->m_type];
774 	m->m_type = type;
775 }
776 
777 static void
778 m_reclaim(void)
779 {
780 	struct domain *dp;
781 	struct protosw *pr;
782 
783 	kprintf("Debug: m_reclaim() called\n");
784 
785 	SLIST_FOREACH(dp, &domains, dom_next) {
786 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
787 			if (pr->pr_drain)
788 				(*pr->pr_drain)();
789 		}
790 	}
791 	++mbstat[mycpu->gd_cpuid].m_drain;
792 }
793 
794 static __inline void
795 updatestats(struct mbuf *m, int type)
796 {
797 	struct globaldata *gd = mycpu;
798 
799 	m->m_type = type;
800 	mbuftrack(m);
801 #ifdef MBUF_DEBUG
802 	KASSERT(m->m_next == NULL, ("mbuf %p: bad m_next in get", m));
803 	KASSERT(m->m_nextpkt == NULL, ("mbuf %p: bad m_nextpkt in get", m));
804 #endif
805 
806 	++mbtypes[gd->gd_cpuid].stats[type];
807 	++mbstat[gd->gd_cpuid].m_mbufs;
808 
809 }
810 
811 /*
812  * Allocate an mbuf.
813  */
814 struct mbuf *
815 m_get(int how, int type)
816 {
817 	struct mbuf *m;
818 	int ntries = 0;
819 	int ocf = MBTOM(how);
820 
821 retryonce:
822 
823 	m = objcache_get(mbuf_cache, ocf);
824 
825 	if (m == NULL) {
826 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
827 			struct objcache *reclaimlist[] = {
828 				mbufphdr_cache,
829 				mbufcluster_cache,
830 				mbufphdrcluster_cache,
831 				mbufjcluster_cache,
832 				mbufphdrjcluster_cache
833 			};
834 			const int nreclaims = NELEM(reclaimlist);
835 
836 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
837 				m_reclaim();
838 			goto retryonce;
839 		}
840 		++mbstat[mycpu->gd_cpuid].m_drops;
841 		return (NULL);
842 	}
843 #ifdef MBUF_DEBUG
844 	KASSERT(m->m_data == m->m_dat, ("mbuf %p: bad m_data in get", m));
845 #endif
846 	m->m_len = 0;
847 
848 	updatestats(m, type);
849 	return (m);
850 }
851 
852 struct mbuf *
853 m_gethdr(int how, int type)
854 {
855 	struct mbuf *m;
856 	int ocf = MBTOM(how);
857 	int ntries = 0;
858 
859 retryonce:
860 
861 	m = objcache_get(mbufphdr_cache, ocf);
862 
863 	if (m == NULL) {
864 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
865 			struct objcache *reclaimlist[] = {
866 				mbuf_cache,
867 				mbufcluster_cache, mbufphdrcluster_cache,
868 				mbufjcluster_cache, mbufphdrjcluster_cache
869 			};
870 			const int nreclaims = NELEM(reclaimlist);
871 
872 			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
873 				m_reclaim();
874 			goto retryonce;
875 		}
876 		++mbstat[mycpu->gd_cpuid].m_drops;
877 		return (NULL);
878 	}
879 #ifdef MBUF_DEBUG
880 	KASSERT(m->m_data == m->m_pktdat, ("mbuf %p: bad m_data in get", m));
881 #endif
882 	m->m_len = 0;
883 	m->m_pkthdr.len = 0;
884 
885 	updatestats(m, type);
886 	return (m);
887 }
888 
889 /*
890  * Get an mbuf (not an mbuf cluster!) and zero it.
891  * Deprecated.
892  */
893 struct mbuf *
894 m_getclr(int how, int type)
895 {
896 	struct mbuf *m;
897 
898 	m = m_get(how, type);
899 	if (m != NULL)
900 		bzero(m->m_data, MLEN);
901 	return (m);
902 }
903 
904 static struct mbuf *
905 m_getcl_cache(int how, short type, int flags, struct objcache *mbclc,
906     struct objcache *mbphclc, u_long *cl_stats)
907 {
908 	struct mbuf *m = NULL;
909 	int ocflags = MBTOM(how);
910 	int ntries = 0;
911 
912 retryonce:
913 
914 	if (flags & M_PKTHDR)
915 		m = objcache_get(mbphclc, ocflags);
916 	else
917 		m = objcache_get(mbclc, ocflags);
918 
919 	if (m == NULL) {
920 		if ((how & MB_TRYWAIT) && ntries++ == 0) {
921 			struct objcache *reclaimlist[1];
922 
923 			if (flags & M_PKTHDR)
924 				reclaimlist[0] = mbclc;
925 			else
926 				reclaimlist[0] = mbphclc;
927 			if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
928 				m_reclaim();
929 			goto retryonce;
930 		}
931 		++mbstat[mycpu->gd_cpuid].m_drops;
932 		return (NULL);
933 	}
934 
935 #ifdef MBUF_DEBUG
936 	KASSERT(m->m_data == m->m_ext.ext_buf,
937 		("mbuf %p: bad m_data in get", m));
938 #endif
939 	m->m_type = type;
940 	m->m_len = 0;
941 	m->m_pkthdr.len = 0;	/* just do it unconditionally */
942 
943 	mbuftrack(m);
944 
945 	++mbtypes[mycpu->gd_cpuid].stats[type];
946 	++(*cl_stats);
947 	return (m);
948 }
949 
950 struct mbuf *
951 m_getjcl(int how, short type, int flags, size_t size)
952 {
953 	struct objcache *mbclc, *mbphclc;
954 	u_long *cl_stats;
955 
956 	switch (size) {
957 	case MCLBYTES:
958 		mbclc = mbufcluster_cache;
959 		mbphclc = mbufphdrcluster_cache;
960 		cl_stats = &mbstat[mycpu->gd_cpuid].m_clusters;
961 		break;
962 
963 	default:
964 		mbclc = mbufjcluster_cache;
965 		mbphclc = mbufphdrjcluster_cache;
966 		cl_stats = &mbstat[mycpu->gd_cpuid].m_jclusters;
967 		break;
968 	}
969 	return m_getcl_cache(how, type, flags, mbclc, mbphclc, cl_stats);
970 }
971 
972 /*
973  * Returns an mbuf with an attached cluster.
974  * Because many network drivers use this kind of buffer a lot, it is
975  * convenient to keep a small pool of free buffers of this kind.
976  * Even a small size such as 10 gives about 10% improvement in the
977  * forwarding rate in a bridge or router.
978  */
979 struct mbuf *
980 m_getcl(int how, short type, int flags)
981 {
982 	return m_getcl_cache(how, type, flags,
983 	    mbufcluster_cache, mbufphdrcluster_cache,
984 	    &mbstat[mycpu->gd_cpuid].m_clusters);
985 }
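
/*
 * Usage sketch (illustrative only, typical of a receive path): allocate
 * an mbuf with a cluster already attached and set the lengths, e.g.
 *
 *	m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = MCLBYTES;
 */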
986 
987 /*
988  * Allocate chain of requested length.
989  */
990 struct mbuf *
991 m_getc(int len, int how, int type)
992 {
993 	struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
994 	int nsize;
995 
996 	while (len > 0) {
997 		n = m_getl(len, how, type, 0, &nsize);
998 		if (n == NULL)
999 			goto failed;
1000 		n->m_len = 0;
1001 		*ntail = n;
1002 		ntail = &n->m_next;
1003 		len -= nsize;
1004 	}
1005 	return (nfirst);
1006 
1007 failed:
1008 	m_freem(nfirst);
1009 	return (NULL);
1010 }
1011 
1012 /*
1013  * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
1014  * and return a pointer to the head of the allocated chain. If m0 is
1015  * non-null, then we assume that it is a single mbuf or an mbuf chain to
1016  * which we want len bytes worth of mbufs and/or clusters attached, and so
1017  * if we succeed in allocating it, we will just return a pointer to m0.
1018  *
1019  * If we happen to fail at any point during the allocation, we will free
1020  * up everything we have already allocated and return NULL.
1021  *
1022  * Deprecated.  Use m_getc() and m_cat() instead.
1023  */
1024 struct mbuf *
1025 m_getm(struct mbuf *m0, int len, int type, int how)
1026 {
1027 	struct mbuf *nfirst;
1028 
1029 	nfirst = m_getc(len, how, type);
1030 
1031 	if (m0 != NULL) {
1032 		m_last(m0)->m_next = nfirst;
1033 		return (m0);
1034 	}
1035 
1036 	return (nfirst);
1037 }
1038 
1039 /*
1040  * Adds a cluster to a normal mbuf, M_EXT is set on success.
1041  * Deprecated.  Use m_getcl() instead.
1042  */
1043 void
1044 m_mclget(struct mbuf *m, int how)
1045 {
1046 	struct mbcluster *mcl;
1047 
1048 	KKASSERT((m->m_flags & M_EXT) == 0);
1049 	mcl = objcache_get(mclmeta_cache, MBTOM(how));
1050 	if (mcl != NULL) {
1051 		linkcluster(m, mcl);
1052 		++mbstat[mycpu->gd_cpuid].m_clusters;
1053 	} else {
1054 		++mbstat[mycpu->gd_cpuid].m_drops;
1055 	}
1056 }
1057 
1058 /*
1059  * Updates to mbcluster must be MPSAFE.  Only an entity which already has
1060  * a reference to the cluster can ref it, so we are in no danger of
1061  * racing an add with a subtract.  But the operation must still be atomic
1062  * since multiple entities may have a reference on the cluster.
1063  *
1064  * m_mclfree() is almost the same but it must contend with two entities
1065  * freeing the cluster at the same time.
1066  */
1067 static void
1068 m_mclref(void *arg)
1069 {
1070 	struct mbcluster *mcl = arg;
1071 
1072 	atomic_add_int(&mcl->mcl_refs, 1);
1073 }
1074 
1075 /*
1076  * When dereferencing a cluster we have to deal with a N->0 race, where
1077  * N entities free their references simultaneously.  To do this we use
1078  * atomic_fetchadd_int().
1079  */
1080 static void
1081 m_mclfree(void *arg)
1082 {
1083 	struct mbcluster *mcl = arg;
1084 
1085 	if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) {
1086 		--mbstat[mycpu->gd_cpuid].m_clusters;
1087 		objcache_put(mclmeta_cache, mcl);
1088 	}
1089 }
1090 
1091 static void
1092 m_mjclfree(void *arg)
1093 {
1094 	struct mbcluster *mcl = arg;
1095 
1096 	if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) {
1097 		--mbstat[mycpu->gd_cpuid].m_jclusters;
1098 		objcache_put(mjclmeta_cache, mcl);
1099 	}
1100 }
1101 
1102 /*
1103  * Free a single mbuf and any associated external storage.  The successor,
1104  * if any, is returned.
1105  *
1106  * We need to check non-first mbufs for m_aux, since some existing
1107  * code does not call M_PREPEND properly.
1108  * (example: calls to bpf_mtap from drivers)
1109  */
1110 
1111 #ifdef MBUF_DEBUG
1112 
1113 struct mbuf  *
1114 _m_free(struct mbuf *m, const char *func)
1115 
1116 #else
1117 
1118 struct mbuf *
1119 m_free(struct mbuf *m)
1120 
1121 #endif
1122 {
1123 	struct mbuf *n;
1124 	struct globaldata *gd = mycpu;
1125 
1126 	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
1127 	KASSERT(M_TRAILINGSPACE(m) >= 0, ("overflowed mbuf %p", m));
1128 	--mbtypes[gd->gd_cpuid].stats[m->m_type];
1129 
1130 	n = m->m_next;
1131 
1132 	/*
1133 	 * Make sure the mbuf is in constructed state before returning it
1134 	 * to the objcache.
1135 	 */
1136 	m->m_next = NULL;
1137 	mbufuntrack(m);
1138 #ifdef MBUF_DEBUG
1139 	m->m_hdr.mh_lastfunc = func;
1140 #endif
1141 #ifdef notyet
1142 	KKASSERT(m->m_nextpkt == NULL);
1143 #else
1144 	if (m->m_nextpkt != NULL) {
1145 		static int afewtimes = 10;
1146 
1147 		if (afewtimes-- > 0) {
1148 			kprintf("mfree: m->m_nextpkt != NULL\n");
1149 			print_backtrace(-1);
1150 		}
1151 		m->m_nextpkt = NULL;
1152 	}
1153 #endif
1154 	if (m->m_flags & M_PKTHDR) {
1155 		m_tag_delete_chain(m);		/* eliminate XXX JH */
1156 	}
1157 
1158 	m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
1159 
1160 	/*
1161 	 * Clean the M_PKTHDR state so we can return the mbuf to its original
1162 	 * cache.  This is based on the PHCACHE flag which tells us whether
1163 	 * the mbuf was originally allocated out of a packet-header cache
1164 	 * or a non-packet-header cache.
1165 	 */
1166 	if (m->m_flags & M_PHCACHE) {
1167 		m->m_flags |= M_PKTHDR;
1168 		m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
1169 		m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
1170 		m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
1171 		SLIST_INIT(&m->m_pkthdr.tags);
1172 	}
1173 
1174 	/*
1175 	 * Handle remaining flags combinations.  M_CLCACHE tells us whether
1176 	 * the mbuf was originally allocated from a cluster cache or not,
1177 	 * and is totally separate from whether the mbuf is currently
1178 	 * associated with a cluster.
1179 	 */
1180 	switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
1181 	case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
1182 		/*
1183 		 * mbuf+cluster cache case.  The mbuf was allocated from the
1184 		 * combined mbuf_cluster cache and can be returned to the
1185 		 * cache if the cluster hasn't been shared.
1186 		 */
1187 		if (m_sharecount(m) == 1) {
1188 			/*
1189 			 * The cluster has not been shared, we can just
1190 			 * reset the data pointer and return the mbuf
1191 			 * to the cluster cache.  Note that the reference
1192 			 * count is left intact (it is still associated with
1193 			 * an mbuf).
1194 			 */
1195 			m->m_data = m->m_ext.ext_buf;
1196 			if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES) {
1197 				if (m->m_flags & M_PHCACHE)
1198 					objcache_put(mbufphdrjcluster_cache, m);
1199 				else
1200 					objcache_put(mbufjcluster_cache, m);
1201 				--mbstat[mycpu->gd_cpuid].m_jclusters;
1202 			} else {
1203 				if (m->m_flags & M_PHCACHE)
1204 					objcache_put(mbufphdrcluster_cache, m);
1205 				else
1206 					objcache_put(mbufcluster_cache, m);
1207 				--mbstat[mycpu->gd_cpuid].m_clusters;
1208 			}
1209 		} else {
1210 			/*
1211 			 * Hell.  Someone else has a ref on this cluster,
1212 			 * we have to disconnect it which means we can't
1213 			 * put it back into the mbufcluster_cache, we
1214 			 * have to destroy the mbuf.
1215 			 *
1216 			 * Other mbuf references to the cluster will typically
1217 			 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
1218 			 *
1219 			 * XXX we could try to connect another cluster to
1220 			 * it.
1221 			 */
1222 			m->m_ext.ext_free(m->m_ext.ext_arg);
1223 			m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1224 			if (m->m_ext.ext_size == MCLBYTES) {
1225 				if (m->m_flags & M_PHCACHE)
1226 					objcache_dtor(mbufphdrcluster_cache, m);
1227 				else
1228 					objcache_dtor(mbufcluster_cache, m);
1229 			} else {
1230 				if (m->m_flags & M_PHCACHE)
1231 					objcache_dtor(mbufphdrjcluster_cache, m);
1232 				else
1233 					objcache_dtor(mbufjcluster_cache, m);
1234 			}
1235 		}
1236 		break;
1237 	case M_EXT | M_EXT_CLUSTER:
1238 	case M_EXT:
1239 		/*
1240 		 * Normal cluster association case, disconnect the cluster from
1241 		 * the mbuf.  The cluster may or may not be custom.
1242 		 */
1243 		m->m_ext.ext_free(m->m_ext.ext_arg);
1244 		m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1245 		/* fall through */
1246 	case 0:
1247 		/*
1248 		 * return the mbuf to the mbuf cache.
1249 		 */
1250 		if (m->m_flags & M_PHCACHE) {
1251 			m->m_data = m->m_pktdat;
1252 			objcache_put(mbufphdr_cache, m);
1253 		} else {
1254 			m->m_data = m->m_dat;
1255 			objcache_put(mbuf_cache, m);
1256 		}
1257 		--mbstat[mycpu->gd_cpuid].m_mbufs;
1258 		break;
1259 	default:
1260 		if (!panicstr)
1261 			panic("bad mbuf flags %p %08x", m, m->m_flags);
1262 		break;
1263 	}
1264 	return (n);
1265 }
1266 
1267 #ifdef MBUF_DEBUG
1268 
1269 void
1270 _m_freem(struct mbuf *m, const char *func)
1271 {
1272 	while (m)
1273 		m = _m_free(m, func);
1274 }
1275 
1276 #else
1277 
1278 void
1279 m_freem(struct mbuf *m)
1280 {
1281 	while (m)
1282 		m = m_free(m);
1283 }
1284 
1285 #endif
1286 
1287 void
1288 m_extadd(struct mbuf *m, caddr_t buf, u_int size,  void (*reff)(void *),
1289     void (*freef)(void *), void *arg)
1290 {
1291 	m->m_ext.ext_arg = arg;
1292 	m->m_ext.ext_buf = buf;
1293 	m->m_ext.ext_ref = reff;
1294 	m->m_ext.ext_free = freef;
1295 	m->m_ext.ext_size = size;
1296 	reff(arg);
1297 	m->m_data = buf;
1298 	m->m_flags |= M_EXT;
1299 }
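
/*
 * Usage sketch (illustrative only; mydrv_ref, mydrv_free, mybuf and
 * MYBUFSIZE are hypothetical driver-supplied names): attach external,
 * driver-owned storage to an mbuf, e.g.
 *
 *	m_extadd(m, (caddr_t)mybuf, MYBUFSIZE, mydrv_ref, mydrv_free, mybuf);
 *	m->m_len = MYBUFSIZE;
 */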
1300 
1301 /*
1302  * mbuf utility routines
1303  */
1304 
1305 /*
1306  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
1307  * copy junk along.
1308  */
1309 struct mbuf *
1310 m_prepend(struct mbuf *m, int len, int how)
1311 {
1312 	struct mbuf *mn;
1313 
1314 	if (m->m_flags & M_PKTHDR)
1315 	    mn = m_gethdr(how, m->m_type);
1316 	else
1317 	    mn = m_get(how, m->m_type);
1318 	if (mn == NULL) {
1319 		m_freem(m);
1320 		return (NULL);
1321 	}
1322 	if (m->m_flags & M_PKTHDR)
1323 		M_MOVE_PKTHDR(mn, m);
1324 	mn->m_next = m;
1325 	m = mn;
1326 	if (len < MHLEN)
1327 		MH_ALIGN(m, len);
1328 	m->m_len = len;
1329 	return (m);
1330 }
1331 
1332 /*
1333  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1334  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
1335  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
1336  * Note that the copy is read-only, because clusters are not copied,
1337  * only their reference counts are incremented.
1338  */
1339 struct mbuf *
1340 m_copym(const struct mbuf *m, int off0, int len, int wait)
1341 {
1342 	struct mbuf *n, **np;
1343 	int off = off0;
1344 	struct mbuf *top;
1345 	int copyhdr = 0;
1346 
1347 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
1348 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
1349 	if (off == 0 && (m->m_flags & M_PKTHDR))
1350 		copyhdr = 1;
1351 	while (off > 0) {
1352 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1353 		if (off < m->m_len)
1354 			break;
1355 		off -= m->m_len;
1356 		m = m->m_next;
1357 	}
1358 	np = &top;
1359 	top = NULL;
1360 	while (len > 0) {
1361 		if (m == NULL) {
1362 			KASSERT(len == M_COPYALL,
1363 			    ("m_copym, length > size of mbuf chain"));
1364 			break;
1365 		}
1366 		/*
1367 		 * Because we are sharing any cluster attachment below,
1368 		 * be sure to get an mbuf that does not have a cluster
1369 		 * associated with it.
1370 		 */
1371 		if (copyhdr)
1372 			n = m_gethdr(wait, m->m_type);
1373 		else
1374 			n = m_get(wait, m->m_type);
1375 		*np = n;
1376 		if (n == NULL)
1377 			goto nospace;
1378 		if (copyhdr) {
1379 			if (!m_dup_pkthdr(n, m, wait))
1380 				goto nospace;
1381 			if (len == M_COPYALL)
1382 				n->m_pkthdr.len -= off0;
1383 			else
1384 				n->m_pkthdr.len = len;
1385 			copyhdr = 0;
1386 		}
1387 		n->m_len = min(len, m->m_len - off);
1388 		if (m->m_flags & M_EXT) {
1389 			KKASSERT((n->m_flags & M_EXT) == 0);
1390 			n->m_data = m->m_data + off;
1391 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1392 			n->m_ext = m->m_ext;
1393 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1394 		} else {
1395 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1396 			    (unsigned)n->m_len);
1397 		}
1398 		if (len != M_COPYALL)
1399 			len -= n->m_len;
1400 		off = 0;
1401 		m = m->m_next;
1402 		np = &n->m_next;
1403 	}
1404 	if (top == NULL)
1405 		++mbstat[mycpu->gd_cpuid].m_mcfail;
1406 	return (top);
1407 nospace:
1408 	m_freem(top);
1409 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1410 	return (NULL);
1411 }
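
/*
 * Usage sketch (illustrative only): keep a read-only reference copy of a
 * packet, e.g. for a retransmission queue, without duplicating cluster
 * data:
 *
 *	n = m_copym(m, 0, M_COPYALL, MB_DONTWAIT);
 *	if (n == NULL)
 *		(allocation failed; the original chain m is untouched)
 */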
1412 
1413 /*
1414  * Copy an entire packet, including header (which must be present).
1415  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1416  * Note that the copy is read-only, because clusters are not copied,
1417  * only their reference counts are incremented.
1418  * Preserve alignment of the first mbuf so if the creator has left
1419  * some room at the beginning (e.g. for inserting protocol headers)
1420  * the copies also have the room available.
1421  */
1422 struct mbuf *
1423 m_copypacket(struct mbuf *m, int how)
1424 {
1425 	struct mbuf *top, *n, *o;
1426 
1427 	n = m_gethdr(how, m->m_type);
1428 	top = n;
1429 	if (!n)
1430 		goto nospace;
1431 
1432 	if (!m_dup_pkthdr(n, m, how))
1433 		goto nospace;
1434 	n->m_len = m->m_len;
1435 	if (m->m_flags & M_EXT) {
1436 		KKASSERT((n->m_flags & M_EXT) == 0);
1437 		n->m_data = m->m_data;
1438 		m->m_ext.ext_ref(m->m_ext.ext_arg);
1439 		n->m_ext = m->m_ext;
1440 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1441 	} else {
1442 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
1443 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1444 	}
1445 
1446 	m = m->m_next;
1447 	while (m) {
1448 		o = m_get(how, m->m_type);
1449 		if (!o)
1450 			goto nospace;
1451 
1452 		n->m_next = o;
1453 		n = n->m_next;
1454 
1455 		n->m_len = m->m_len;
1456 		if (m->m_flags & M_EXT) {
1457 			KKASSERT((n->m_flags & M_EXT) == 0);
1458 			n->m_data = m->m_data;
1459 			m->m_ext.ext_ref(m->m_ext.ext_arg);
1460 			n->m_ext = m->m_ext;
1461 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1462 		} else {
1463 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1464 		}
1465 
1466 		m = m->m_next;
1467 	}
1468 	return top;
1469 nospace:
1470 	m_freem(top);
1471 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1472 	return (NULL);
1473 }
1474 
1475 /*
1476  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1477  * continuing for "len" bytes, into the indicated buffer.
1478  */
1479 void
1480 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1481 {
1482 	unsigned count;
1483 
1484 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1485 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1486 	while (off > 0) {
1487 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1488 		if (off < m->m_len)
1489 			break;
1490 		off -= m->m_len;
1491 		m = m->m_next;
1492 	}
1493 	while (len > 0) {
1494 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1495 		count = min(m->m_len - off, len);
1496 		bcopy(mtod(m, caddr_t) + off, cp, count);
1497 		len -= count;
1498 		cp += count;
1499 		off = 0;
1500 		m = m->m_next;
1501 	}
1502 }
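
/*
 * Usage sketch (illustrative only; struct ip is just an example header):
 * pull a fixed-size header out of a chain into local storage regardless
 * of how the chain is fragmented:
 *
 *	struct ip iph;
 *
 *	m_copydata(m, 0, sizeof(iph), (caddr_t)&iph);
 */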
1503 
1504 /*
1505  * Copy a packet header mbuf chain into a completely new chain, including
1506  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1507  * you need a writable copy of an mbuf chain.
1508  */
1509 struct mbuf *
1510 m_dup(struct mbuf *m, int how)
1511 {
1512 	struct mbuf **p, *top = NULL;
1513 	int remain, moff, nsize;
1514 
1515 	/* Sanity check */
1516 	if (m == NULL)
1517 		return (NULL);
1518 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1519 
1520 	/* While there's more data, get a new mbuf, tack it on, and fill it */
1521 	remain = m->m_pkthdr.len;
1522 	moff = 0;
1523 	p = &top;
1524 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
1525 		struct mbuf *n;
1526 
1527 		/* Get the next new mbuf */
1528 		n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1529 			   &nsize);
1530 		if (n == NULL)
1531 			goto nospace;
1532 		if (top == NULL)
1533 			if (!m_dup_pkthdr(n, m, how))
1534 				goto nospace0;
1535 
1536 		/* Link it into the new chain */
1537 		*p = n;
1538 		p = &n->m_next;
1539 
1540 		/* Copy data from original mbuf(s) into new mbuf */
1541 		n->m_len = 0;
1542 		while (n->m_len < nsize && m != NULL) {
1543 			int chunk = min(nsize - n->m_len, m->m_len - moff);
1544 
1545 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1546 			moff += chunk;
1547 			n->m_len += chunk;
1548 			remain -= chunk;
1549 			if (moff == m->m_len) {
1550 				m = m->m_next;
1551 				moff = 0;
1552 			}
1553 		}
1554 
1555 		/* Check correct total mbuf length */
1556 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1557 			("%s: bogus m_pkthdr.len", __func__));
1558 	}
1559 	return (top);
1560 
1561 nospace:
1562 	m_freem(top);
1563 nospace0:
1564 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1565 	return (NULL);
1566 }
1567 
1568 /*
1569  * Copy the non-packet mbuf data chain into a new set of mbufs, including
1570  * copying any mbuf clusters.  This is typically used to realign a data
1571  * chain by nfs_realign().
1572  *
1573  * The original chain is left intact.  how should be MB_WAIT or MB_DONTWAIT
1574  * and NULL can be returned if MB_DONTWAIT is passed.
1575  *
1576  * Be careful to use cluster mbufs; a large mbuf chain converted to
1577  * non-cluster mbufs can exhaust our supply of mbufs.
1578  */
1579 struct mbuf *
1580 m_dup_data(struct mbuf *m, int how)
1581 {
1582 	struct mbuf **p, *n, *top = NULL;
1583 	int mlen, moff, chunk, gsize, nsize;
1584 
1585 	/*
1586 	 * Degenerate case
1587 	 */
1588 	if (m == NULL)
1589 		return (NULL);
1590 
1591 	/*
1592 	 * Optimize the mbuf allocation but do not get too carried away.
1593 	 */
1594 	if (m->m_next || m->m_len > MLEN)
1595 		if (m->m_flags & M_EXT && m->m_ext.ext_size == MCLBYTES)
1596 			gsize = MCLBYTES;
1597 		else
1598 			gsize = MJUMPAGESIZE;
1599 	else
1600 		gsize = MLEN;
1601 
1602 	/* Chain control */
1603 	p = &top;
1604 	n = NULL;
1605 	nsize = 0;
1606 
1607 	/*
1608 	 * Scan the mbuf chain until nothing is left, the new mbuf chain
1609 	 * will be allocated on the fly as needed.
1610 	 */
1611 	while (m) {
1612 		mlen = m->m_len;
1613 		moff = 0;
1614 
1615 		while (mlen) {
1616 			KKASSERT(m->m_type == MT_DATA);
1617 			if (n == NULL) {
1618 				n = m_getl(gsize, how, MT_DATA, 0, &nsize);
1619 				if (n == NULL)
1620 					goto nospace;
1621 				n->m_len = 0;
1622 				*p = n;
1623 				p = &n->m_next;
1624 			}
1625 			chunk = imin(mlen, nsize);
1626 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1627 			mlen -= chunk;
1628 			moff += chunk;
1629 			n->m_len += chunk;
1630 			nsize -= chunk;
1631 			if (nsize == 0)
1632 				n = NULL;
1633 		}
1634 		m = m->m_next;
1635 	}
1636 	*p = NULL;
1637 	return(top);
1638 nospace:
1639 	*p = NULL;
1640 	m_freem(top);
1641 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1642 	return (NULL);
1643 }
1644 
1645 /*
1646  * Concatenate mbuf chain n to m.
1647  * Both chains must be of the same type (e.g. MT_DATA).
1648  * Any m_pkthdr is not updated.
1649  */
1650 void
1651 m_cat(struct mbuf *m, struct mbuf *n)
1652 {
1653 	m = m_last(m);
1654 	while (n) {
1655 		if (m->m_flags & M_EXT ||
1656 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1657 			/* just join the two chains */
1658 			m->m_next = n;
1659 			return;
1660 		}
1661 		/* splat the data from one into the other */
1662 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1663 		    (u_int)n->m_len);
1664 		m->m_len += n->m_len;
1665 		n = m_free(n);
1666 	}
1667 }
1668 
1669 void
1670 m_adj(struct mbuf *mp, int req_len)
1671 {
1672 	int len = req_len;
1673 	struct mbuf *m;
1674 	int count;
1675 
1676 	if ((m = mp) == NULL)
1677 		return;
1678 	if (len >= 0) {
1679 		/*
1680 		 * Trim from head.
1681 		 */
1682 		while (m != NULL && len > 0) {
1683 			if (m->m_len <= len) {
1684 				len -= m->m_len;
1685 				m->m_len = 0;
1686 				m = m->m_next;
1687 			} else {
1688 				m->m_len -= len;
1689 				m->m_data += len;
1690 				len = 0;
1691 			}
1692 		}
1693 		m = mp;
1694 		if (mp->m_flags & M_PKTHDR)
1695 			m->m_pkthdr.len -= (req_len - len);
1696 	} else {
1697 		/*
1698 		 * Trim from tail.  Scan the mbuf chain,
1699 		 * calculating its length and finding the last mbuf.
1700 		 * If the adjustment only affects this mbuf, then just
1701 		 * adjust and return.  Otherwise, rescan and truncate
1702 		 * after the remaining size.
1703 		 */
1704 		len = -len;
1705 		count = 0;
1706 		for (;;) {
1707 			count += m->m_len;
1708 			if (m->m_next == NULL)
1709 				break;
1710 			m = m->m_next;
1711 		}
1712 		if (m->m_len >= len) {
1713 			m->m_len -= len;
1714 			if (mp->m_flags & M_PKTHDR)
1715 				mp->m_pkthdr.len -= len;
1716 			return;
1717 		}
1718 		count -= len;
1719 		if (count < 0)
1720 			count = 0;
1721 		/*
1722 		 * Correct length for chain is "count".
1723 		 * Find the mbuf with last data, adjust its length,
1724 		 * and toss data from remaining mbufs on chain.
1725 		 */
1726 		m = mp;
1727 		if (m->m_flags & M_PKTHDR)
1728 			m->m_pkthdr.len = count;
1729 		for (; m; m = m->m_next) {
1730 			if (m->m_len >= count) {
1731 				m->m_len = count;
1732 				break;
1733 			}
1734 			count -= m->m_len;
1735 		}
1736 		while (m->m_next)
1737 			(m = m->m_next) ->m_len = 0;
1738 	}
1739 }
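
/*
 * Usage sketch (illustrative only, assuming the usual <net/ethernet.h>
 * constants): a positive length trims from the head, a negative length
 * trims from the tail:
 *
 *	m_adj(m, ETHER_HDR_LEN);	(strip the link header in front)
 *	m_adj(m, -ETHER_CRC_LEN);	(strip the trailing CRC)
 */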
1740 
1741 /*
1742  * Set the m_data pointer of a newly-allocated mbuf
1743  * to place an object of the specified size at the
1744  * end of the mbuf, longword aligned.
1745  */
1746 void
1747 m_align(struct mbuf *m, int len)
1748 {
1749 	int adjust;
1750 
1751 	if (m->m_flags & M_EXT)
1752 		adjust = m->m_ext.ext_size - len;
1753 	else if (m->m_flags & M_PKTHDR)
1754 		adjust = MHLEN - len;
1755 	else
1756 		adjust = MLEN - len;
1757 	m->m_data += adjust &~ (sizeof(long)-1);
1758 }
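
/*
 * Worked example (illustrative only): for a freshly allocated plain mbuf
 * (no M_EXT, no M_PKTHDR) and len = 14, m_data is advanced by
 * (MLEN - 14) & ~(sizeof(long) - 1) bytes, leaving a longword-aligned
 * 14-byte region at the end of the data area.
 */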
1759 
1760 /*
1761  * Create a writable copy of the mbuf chain.  While doing this
1762  * we compact the chain with a goal of producing a chain with
1763  * at most two mbufs.  The second mbuf in this chain is likely
1764  * to be a cluster.  The primary purpose of this work is to create
1765  * a writable packet for encryption, compression, etc.  The
1766  * secondary goal is to linearize the data so the data can be
1767  * passed to crypto hardware in the most efficient manner possible.
1768  */
1769 struct mbuf *
1770 m_unshare(struct mbuf *m0, int how)
1771 {
1772 	struct mbuf *m, *mprev;
1773 	struct mbuf *n, *mfirst, *mlast;
1774 	int len, off;
1775 
1776 	mprev = NULL;
1777 	for (m = m0; m != NULL; m = mprev->m_next) {
1778 		/*
1779 		 * Regular mbufs are ignored unless there's a cluster
1780 		 * in front of it that we can use to coalesce.  We do
1781 		 * the latter mainly so later clusters can be coalesced
1782 		 * also w/o having to handle them specially (i.e. convert
1783 		 * mbuf+cluster -> cluster).  This optimization is heavily
1784 		 * influenced by the assumption that we're running over
1785 		 * Ethernet where MCLBYTES is large enough that the max
1786 		 * packet size will permit lots of coalescing into a
1787 		 * single cluster.  This in turn permits efficient
1788 		 * crypto operations, especially when using hardware.
1789 		 */
1790 		if ((m->m_flags & M_EXT) == 0) {
1791 			if (mprev && (mprev->m_flags & M_EXT) &&
1792 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
1793 				/* XXX: this ignores mbuf types */
1794 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1795 				       mtod(m, caddr_t), m->m_len);
1796 				mprev->m_len += m->m_len;
1797 				mprev->m_next = m->m_next;	/* unlink from chain */
1798 				m_free(m);			/* reclaim mbuf */
1799 			} else {
1800 				mprev = m;
1801 			}
1802 			continue;
1803 		}
1804 		/*
1805 		 * Writable mbufs are left alone (for now).
1806 		 */
1807 		if (M_WRITABLE(m)) {
1808 			mprev = m;
1809 			continue;
1810 		}
1811 
1812 		/*
1813 		 * Not writable, replace with a copy or coalesce with
1814 		 * the previous mbuf if possible (since we have to copy
1815 		 * it anyway, we try to reduce the number of mbufs and
1816 		 * clusters so that future work is easier).
1817 		 */
1818 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
1819 		/* NB: we only coalesce into a cluster or larger */
1820 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
1821 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
1822 			/* XXX: this ignores mbuf types */
1823 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1824 			       mtod(m, caddr_t), m->m_len);
1825 			mprev->m_len += m->m_len;
1826 			mprev->m_next = m->m_next;	/* unlink from chain */
1827 			m_free(m);			/* reclaim mbuf */
1828 			continue;
1829 		}
1830 
1831 		/*
1832 		 * Allocate new space to hold the copy...
1833 		 */
1834 		/* XXX why can M_PKTHDR be set past the first mbuf? */
1835 		if (mprev == NULL && (m->m_flags & M_PKTHDR)) {
1836 			/*
1837 			 * NB: if a packet header is present we must
1838 			 * allocate the mbuf separately from any cluster
1839 			 * because M_MOVE_PKTHDR will smash the data
1840 			 * pointer and drop the M_EXT marker.
1841 			 */
1842 			MGETHDR(n, how, m->m_type);
1843 			if (n == NULL) {
1844 				m_freem(m0);
1845 				return (NULL);
1846 			}
1847 			M_MOVE_PKTHDR(n, m);
1848 			MCLGET(n, how);
1849 			if ((n->m_flags & M_EXT) == 0) {
1850 				m_free(n);
1851 				m_freem(m0);
1852 				return (NULL);
1853 			}
1854 		} else {
1855 			n = m_getcl(how, m->m_type, m->m_flags);
1856 			if (n == NULL) {
1857 				m_freem(m0);
1858 				return (NULL);
1859 			}
1860 		}
1861 		/*
1862 		 * ... and copy the data.  We deal with jumbo mbufs
1863 		 * (i.e. m_len > MCLBYTES) by splitting them into
1864 		 * clusters.  We could just malloc a buffer and make
1865 		 * it external but too many device drivers don't know
1866 		 * how to break up the non-contiguous memory when
1867 		 * doing DMA.
1868 		 */
1869 		len = m->m_len;
1870 		off = 0;
1871 		mfirst = n;
1872 		mlast = NULL;
1873 		for (;;) {
1874 			int cc = min(len, MCLBYTES);
1875 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
1876 			n->m_len = cc;
1877 			if (mlast != NULL)
1878 				mlast->m_next = n;
1879 			mlast = n;
1880 
1881 			len -= cc;
1882 			if (len <= 0)
1883 				break;
1884 			off += cc;
1885 
1886 			n = m_getcl(how, m->m_type, m->m_flags);
1887 			if (n == NULL) {
1888 				m_freem(mfirst);
1889 				m_freem(m0);
1890 				return (NULL);
1891 			}
1892 		}
1893 		n->m_next = m->m_next;
1894 		if (mprev == NULL)
1895 			m0 = mfirst;		/* new head of chain */
1896 		else
1897 			mprev->m_next = mfirst;	/* replace old mbuf */
1898 		m_free(m);			/* release old mbuf */
1899 		mprev = mfirst;
1900 	}
1901 	return (m0);
1902 }
1903 
1904 /*
1905  * Rearrange an mbuf chain so that len bytes are contiguous
1906  * and in the data area of an mbuf (so that mtod will work for a structure
1907  * of size len).  Returns the resulting mbuf chain on success, frees it and
1908  * returns null on failure.  If there is room, it will add up to
1909  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1910  * avoid being called next time.
1911  */
1912 struct mbuf *
1913 m_pullup(struct mbuf *n, int len)
1914 {
1915 	struct mbuf *m;
1916 	int count;
1917 	int space;
1918 
1919 	/*
1920 	 * If first mbuf has no cluster, and has room for len bytes
1921 	 * without shifting current data, pullup into it,
1922 	 * otherwise allocate a new mbuf to prepend to the chain.
1923 	 */
1924 	if (!(n->m_flags & M_EXT) &&
1925 	    n->m_data + len < &n->m_dat[MLEN] &&
1926 	    n->m_next) {
1927 		if (n->m_len >= len)
1928 			return (n);
1929 		m = n;
1930 		n = n->m_next;
1931 		len -= m->m_len;
1932 	} else {
1933 		if (len > MHLEN)
1934 			goto bad;
1935 		if (n->m_flags & M_PKTHDR)
1936 			m = m_gethdr(MB_DONTWAIT, n->m_type);
1937 		else
1938 			m = m_get(MB_DONTWAIT, n->m_type);
1939 		if (m == NULL)
1940 			goto bad;
1941 		m->m_len = 0;
1942 		if (n->m_flags & M_PKTHDR)
1943 			M_MOVE_PKTHDR(m, n);
1944 	}
1945 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1946 	do {
1947 		count = min(min(max(len, max_protohdr), space), n->m_len);
1948 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1949 		  (unsigned)count);
1950 		len -= count;
1951 		m->m_len += count;
1952 		n->m_len -= count;
1953 		space -= count;
1954 		if (n->m_len)
1955 			n->m_data += count;
1956 		else
1957 			n = m_free(n);
1958 	} while (len > 0 && n);
1959 	if (len > 0) {
1960 		m_free(m);
1961 		goto bad;
1962 	}
1963 	m->m_next = n;
1964 	return (m);
1965 bad:
1966 	m_freem(n);
1967 	++mbstat[mycpu->gd_cpuid].m_mcfail;
1968 	return (NULL);
1969 }
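
/*
 * Example usage (illustrative sketch): a typical caller pulls up enough
 * bytes for a fixed-size header before dereferencing it via mtod().  The
 * header type below is only an example; on failure the chain has already
 * been freed by m_pullup().
 *
 *	struct ip *ip;
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;
 *	ip = mtod(m, struct ip *);
 */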
1970 
1971 /*
1972  * Partition an mbuf chain in two pieces, returning the tail --
1973  * all but the first len0 bytes.  In case of failure, it returns NULL and
1974  * attempts to restore the chain to its original state.
1975  *
1976  * Note that the resulting mbufs might be read-only, because the new
1977  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1978  * the "breaking point" happens to lie within a cluster mbuf. Use the
1979  * M_WRITABLE() macro to check for this case.
1980  */
1981 struct mbuf *
1982 m_split(struct mbuf *m0, int len0, int wait)
1983 {
1984 	struct mbuf *m, *n;
1985 	unsigned len = len0, remain;
1986 
1987 	for (m = m0; m && len > m->m_len; m = m->m_next)
1988 		len -= m->m_len;
1989 	if (m == NULL)
1990 		return (NULL);
1991 	remain = m->m_len - len;
1992 	if (m0->m_flags & M_PKTHDR) {
1993 		n = m_gethdr(wait, m0->m_type);
1994 		if (n == NULL)
1995 			return (NULL);
1996 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1997 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1998 		m0->m_pkthdr.len = len0;
1999 		if (m->m_flags & M_EXT)
2000 			goto extpacket;
2001 		if (remain > MHLEN) {
2002 			/* m can't be the lead packet */
2003 			MH_ALIGN(n, 0);
2004 			n->m_next = m_split(m, len, wait);
2005 			if (n->m_next == NULL) {
2006 				m_free(n);
2007 				return (NULL);
2008 			} else {
2009 				n->m_len = 0;
2010 				return (n);
2011 			}
2012 		} else
2013 			MH_ALIGN(n, remain);
2014 	} else if (remain == 0) {
2015 		n = m->m_next;
2016 		m->m_next = NULL;
2017 		return (n);
2018 	} else {
2019 		n = m_get(wait, m->m_type);
2020 		if (n == NULL)
2021 			return (NULL);
2022 		M_ALIGN(n, remain);
2023 	}
2024 extpacket:
2025 	if (m->m_flags & M_EXT) {
2026 		KKASSERT((n->m_flags & M_EXT) == 0);
2027 		n->m_data = m->m_data + len;
2028 		m->m_ext.ext_ref(m->m_ext.ext_arg);
2029 		n->m_ext = m->m_ext;
2030 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
2031 	} else {
2032 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
2033 	}
2034 	n->m_len = remain;
2035 	m->m_len = len;
2036 	n->m_next = m->m_next;
2037 	m->m_next = NULL;
2038 	return (n);
2039 }
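
/*
 * Example usage (illustrative sketch): splitting a packet into header and
 * payload pieces at a hypothetical boundary 'hdrlen'.  On failure 'm' is
 * left intact; on success the two pieces may share a cluster and thus be
 * read-only, per the note above.
 *
 *	struct mbuf *payload;
 *
 *	payload = m_split(m, hdrlen, MB_DONTWAIT);
 *	if (payload == NULL)
 *		return (ENOBUFS);
 */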
2040 
2041 /*
2042  * Routine to copy from device local memory into mbufs.
2043  * Note: "offset" is ill-defined and always called as 0, so ignore it.
2044  */
2045 struct mbuf *
2046 m_devget(char *buf, int len, int offset, struct ifnet *ifp,
2047     void (*copy)(volatile const void *from, volatile void *to, size_t length))
2048 {
2049 	struct mbuf *m, *mfirst = NULL, **mtail;
2050 	int nsize, flags;
2051 
2052 	if (copy == NULL)
2053 		copy = bcopy;
2054 	mtail = &mfirst;
2055 	flags = M_PKTHDR;
2056 
2057 	while (len > 0) {
2058 		m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
2059 		if (m == NULL) {
2060 			m_freem(mfirst);
2061 			return (NULL);
2062 		}
2063 		m->m_len = min(len, nsize);
2064 
2065 		if (flags & M_PKTHDR) {
2066 			if (len + max_linkhdr <= nsize)
2067 				m->m_data += max_linkhdr;
2068 			m->m_pkthdr.rcvif = ifp;
2069 			m->m_pkthdr.len = len;
2070 			flags = 0;
2071 		}
2072 
2073 		copy(buf, m->m_data, (unsigned)m->m_len);
2074 		buf += m->m_len;
2075 		len -= m->m_len;
2076 		*mtail = m;
2077 		mtail = &m->m_next;
2078 	}
2079 
2080 	return (mfirst);
2081 }
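
/*
 * Example usage (illustrative sketch): a receive path with a contiguous
 * 'pktlen'-byte DMA buffer could build an mbuf chain from it as below;
 * 'sc' and its members are hypothetical driver state, and passing a NULL
 * copy routine falls back to bcopy.
 *
 *	struct mbuf *m;
 *
 *	m = m_devget(sc->rx_buf, pktlen, 0, &sc->arpcom.ac_if, NULL);
 *	if (m == NULL)
 *		return;
 */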
2082 
2083 /*
2084  * Routine to pad mbuf to the specified length 'padto'.
2085  */
2086 int
2087 m_devpad(struct mbuf *m, int padto)
2088 {
2089 	struct mbuf *last = NULL;
2090 	int padlen;
2091 
2092 	if (padto <= m->m_pkthdr.len)
2093 		return 0;
2094 
2095 	padlen = padto - m->m_pkthdr.len;
2096 
2097 	/* if there's only the packet-header and we can pad there, use it. */
2098 	if (m->m_pkthdr.len == m->m_len && M_TRAILINGSPACE(m) >= padlen) {
2099 		last = m;
2100 	} else {
2101 		/*
2102 		 * Walk packet chain to find last mbuf. We will either
2103 		 * pad there, or append a new mbuf and pad it.
2104 		 */
2105 		for (last = m; last->m_next != NULL; last = last->m_next)
2106 			; /* EMPTY */
2107 
2108 		/* `last' now points to last in chain. */
2109 		if (M_TRAILINGSPACE(last) < padlen) {
2110 			struct mbuf *n;
2111 
2112 			/* Allocate new empty mbuf, pad it.  Compact later. */
2113 			MGET(n, MB_DONTWAIT, MT_DATA);
2114 			if (n == NULL)
2115 				return ENOBUFS;
2116 			n->m_len = 0;
2117 			last->m_next = n;
2118 			last = n;
2119 		}
2120 	}
2121 	KKASSERT(M_TRAILINGSPACE(last) >= padlen);
2122 	KKASSERT(M_WRITABLE(last));
2123 
2124 	/* Now zero the pad area */
2125 	bzero(mtod(last, char *) + last->m_len, padlen);
2126 	last->m_len += padlen;
2127 	m->m_pkthdr.len += padlen;
2128 	return 0;
2129 }
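
/*
 * Example usage (illustrative sketch): an Ethernet transmit path might pad
 * short frames to the minimum frame size (less the CRC appended by the
 * hardware) before handing them to the chip.
 *
 *	if (m->m_pkthdr.len < ETHER_MIN_LEN - ETHER_CRC_LEN &&
 *	    m_devpad(m, ETHER_MIN_LEN - ETHER_CRC_LEN) != 0) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */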
2130 
2131 /*
2132  * Copy data from a buffer back into the indicated mbuf chain,
2133  * starting "off" bytes from the beginning, extending the mbuf
2134  * chain if necessary.
2135  */
2136 void
2137 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
2138 {
2139 	int mlen;
2140 	struct mbuf *m = m0, *n;
2141 	int totlen = 0;
2142 
2143 	if (m0 == NULL)
2144 		return;
2145 	while (off > (mlen = m->m_len)) {
2146 		off -= mlen;
2147 		totlen += mlen;
2148 		if (m->m_next == NULL) {
2149 			n = m_getclr(MB_DONTWAIT, m->m_type);
2150 			if (n == NULL)
2151 				goto out;
2152 			n->m_len = min(MLEN, len + off);
2153 			m->m_next = n;
2154 		}
2155 		m = m->m_next;
2156 	}
2157 	while (len > 0) {
2158 		mlen = min(m->m_len - off, len);
2159 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
2160 		cp += mlen;
2161 		len -= mlen;
2162 		mlen += off;
2163 		off = 0;
2164 		totlen += mlen;
2165 		if (len == 0)
2166 			break;
2167 		if (m->m_next == NULL) {
2168 			n = m_get(MB_DONTWAIT, m->m_type);
2169 			if (n == NULL)
2170 				break;
2171 			n->m_len = min(MLEN, len);
2172 			m->m_next = n;
2173 		}
2174 		m = m->m_next;
2175 	}
2176 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
2177 		m->m_pkthdr.len = totlen;
2178 }
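
/*
 * Example usage (illustrative sketch): overwriting a 16-bit checksum field
 * at offset 'ckoff' within the chain; 'ckoff' and compute_cksum() are
 * hypothetical.  The chain is extended with zero-filled mbufs if it is too
 * short to reach the field.
 *
 *	uint16_t sum = compute_cksum(m);
 *
 *	m_copyback(m, ckoff, sizeof(sum), (caddr_t)&sum);
 */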
2179 
2180 /*
2181  * Append the specified data to the indicated mbuf chain,
2182  * extending the chain if the new data does not fit in
2183  * existing space.
2184  *
2185  * Return 1 if able to complete the job; otherwise 0.
2186  */
2187 int
2188 m_append(struct mbuf *m0, int len, c_caddr_t cp)
2189 {
2190 	struct mbuf *m, *n;
2191 	int remainder, space;
2192 
2193 	for (m = m0; m->m_next != NULL; m = m->m_next)
2194 		;
2195 	remainder = len;
2196 	space = M_TRAILINGSPACE(m);
2197 	if (space > 0) {
2198 		/*
2199 		 * Copy into available space.
2200 		 */
2201 		if (space > remainder)
2202 			space = remainder;
2203 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2204 		m->m_len += space;
2205 		cp += space, remainder -= space;
2206 	}
2207 	while (remainder > 0) {
2208 		/*
2209 		 * Allocate a new mbuf; could check space
2210 		 * and allocate a cluster instead.
2211 		 */
2212 		n = m_get(MB_DONTWAIT, m->m_type);
2213 		if (n == NULL)
2214 			break;
2215 		n->m_len = min(MLEN, remainder);
2216 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2217 		cp += n->m_len, remainder -= n->m_len;
2218 		m->m_next = n;
2219 		m = n;
2220 	}
2221 	if (m0->m_flags & M_PKTHDR)
2222 		m0->m_pkthdr.len += len - remainder;
2223 	return (remainder == 0);
2224 }
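
/*
 * Example usage (illustrative sketch): appending a trailer to a packet;
 * 'trailer' and 'trailer_len' are hypothetical.  The return value must be
 * checked since the chain may not be extendable under memory pressure.
 *
 *	if (m_append(m, trailer_len, (c_caddr_t)trailer) == 0) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */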
2225 
2226 /*
2227  * Apply function f to the data in an mbuf chain starting "off" bytes from
2228  * the beginning, continuing for "len" bytes.
2229  */
2230 int
2231 m_apply(struct mbuf *m, int off, int len,
2232     int (*f)(void *, void *, u_int), void *arg)
2233 {
2234 	u_int count;
2235 	int rval;
2236 
2237 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
2238 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
2239 	while (off > 0) {
2240 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
2241 		if (off < m->m_len)
2242 			break;
2243 		off -= m->m_len;
2244 		m = m->m_next;
2245 	}
2246 	while (len > 0) {
2247 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
2248 		count = min(m->m_len - off, len);
2249 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
2250 		if (rval)
2251 			return (rval);
2252 		len -= count;
2253 		off = 0;
2254 		m = m->m_next;
2255 	}
2256 	return (0);
2257 }
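
/*
 * Example usage (illustrative sketch): m_apply() walks a byte range without
 * linearizing the chain.  The callback below is hypothetical; it XORs every
 * byte it is handed into the accumulator passed via 'arg'.  Returning
 * nonzero from the callback aborts the walk.
 *
 *	static int
 *	xor_bytes(void *arg, void *data, u_int len)
 *	{
 *		uint8_t *acc = arg, *p = data;
 *		u_int i;
 *
 *		for (i = 0; i < len; ++i)
 *			*acc ^= p[i];
 *		return (0);
 *	}
 *
 *	uint8_t acc = 0;
 *	int error = m_apply(m, off, len, xor_bytes, &acc);
 */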
2258 
2259 /*
2260  * Return a pointer to mbuf/offset of location in mbuf chain.
2261  */
2262 struct mbuf *
2263 m_getptr(struct mbuf *m, int loc, int *off)
2264 {
2265 
2266 	while (loc >= 0) {
2267 		/* Normal end of search. */
2268 		if (m->m_len > loc) {
2269 			*off = loc;
2270 			return (m);
2271 		} else {
2272 			loc -= m->m_len;
2273 			if (m->m_next == NULL) {
2274 				if (loc == 0) {
2275 					/* Point at the end of valid data. */
2276 					*off = m->m_len;
2277 					return (m);
2278 				}
2279 				return (NULL);
2280 			}
2281 			m = m->m_next;
2282 		}
2283 	}
2284 	return (NULL);
2285 }
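
/*
 * Example usage (illustrative sketch): locating the mbuf and intra-mbuf
 * offset that hold byte 'loc' of a chain, e.g. before modifying it in
 * place.  This assumes 'loc' is strictly less than the chain's data length
 * and that the target mbuf is writable.
 *
 *	struct mbuf *n;
 *	int off;
 *
 *	n = m_getptr(m, loc, &off);
 *	if (n == NULL)
 *		return (EINVAL);
 *	mtod(n, uint8_t *)[off] = 0;
 */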
2286 
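/*
 * Debugging aid: hex-dump the data of every mbuf in a packet chain
 * to the console.
 */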
2287 void
2288 m_print(const struct mbuf *m)
2289 {
2290 	int len;
2291 	const struct mbuf *m2;
2292 	char *hexstr;
2293 
2294 	len = m->m_pkthdr.len;
2295 	m2 = m;
2296 	hexstr = kmalloc(HEX_NCPYLEN(len), M_TEMP, M_ZERO | M_WAITOK);
2297 	while (len) {
2298 		kprintf("%p %s\n", m2, hexncpy(m2->m_data, m2->m_len, hexstr,
2299 			HEX_NCPYLEN(m2->m_len), "-"));
2300 		len -= m2->m_len;
2301 		m2 = m2->m_next;
2302 	}
2303 	kfree(hexstr, M_TEMP);
2304 	return;
2305 }
2306 
2307 /*
2308  * "Move" mbuf pkthdr from "from" to "to".
2309  * "from" must have M_PKTHDR set, and "to" must be empty.
2310  */
2311 void
2312 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
2313 {
2314 	KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
2315 
2316 	to->m_flags |= from->m_flags & M_COPYFLAGS;
2317 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
2318 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
2319 }
2320 
2321 /*
2322  * Duplicate "from"'s mbuf pkthdr in "to".
2323  * "from" must have M_PKTHDR set, and "to" must be empty.
2324  * In particular, this does a deep copy of the packet tags.
2325  */
2326 int
2327 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
2328 {
2329 	KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
2330 
2331 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
2332 		      (to->m_flags & ~M_COPYFLAGS);
2333 	to->m_pkthdr = from->m_pkthdr;
2334 	SLIST_INIT(&to->m_pkthdr.tags);
2335 	return (m_tag_copy_chain(to, from, how));
2336 }
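
/*
 * Example usage (illustrative sketch): giving a freshly allocated
 * packet-header mbuf a deep copy of another packet's header, including its
 * m_tag chain, before filling in new data.
 *
 *	n = m_gethdr(MB_DONTWAIT, MT_DATA);
 *	if (n == NULL || m_dup_pkthdr(n, m, MB_DONTWAIT) == 0) {
 *		if (n != NULL)
 *			m_free(n);
 *		return (ENOBUFS);
 *	}
 */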
2337 
2338 /*
2339  * Defragment an mbuf chain, returning the shortest possible
2340  * chain of mbufs and clusters.  If allocation fails and
2341  * this cannot be completed, NULL will be returned, but
2342  * the passed in chain will be unchanged.  Upon success,
2343  * the original chain will be freed, and the new chain
2344  * will be returned.
2345  *
2346  * If a chain without a packet header (M_PKTHDR) is passed in,
2347  * the original mbuf chain will be returned unharmed.
2348  *
2349  * m_defrag_nofree doesn't free the passed in mbuf.
2350  */
2351 struct mbuf *
2352 m_defrag(struct mbuf *m0, int how)
2353 {
2354 	struct mbuf *m_new;
2355 
2356 	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
2357 		return (NULL);
2358 	if (m_new != m0)
2359 		m_freem(m0);
2360 	return (m_new);
2361 }
2362 
2363 struct mbuf *
2364 m_defrag_nofree(struct mbuf *m0, int how)
2365 {
2366 	struct mbuf	*m_new = NULL, *m_final = NULL;
2367 	int		progress = 0, length, nsize;
2368 
2369 	if (!(m0->m_flags & M_PKTHDR))
2370 		return (m0);
2371 
2372 #ifdef MBUF_STRESS_TEST
2373 	if (m_defragrandomfailures) {
2374 		int temp = karc4random() & 0xff;
2375 		if (temp == 0xba)
2376 			goto nospace;
2377 	}
2378 #endif
2379 
2380 	m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
2381 	if (m_final == NULL)
2382 		goto nospace;
2383 	m_final->m_len = 0;	/* in case m0->m_pkthdr.len is zero */
2384 
2385 	if (m_dup_pkthdr(m_final, m0, how) == 0)
2386 		goto nospace;
2387 
2388 	m_new = m_final;
2389 
2390 	while (progress < m0->m_pkthdr.len) {
2391 		length = m0->m_pkthdr.len - progress;
2392 		if (length > MCLBYTES)
2393 			length = MCLBYTES;
2394 
2395 		if (m_new == NULL) {
2396 			m_new = m_getl(length, how, MT_DATA, 0, &nsize);
2397 			if (m_new == NULL)
2398 				goto nospace;
2399 		}
2400 
2401 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
2402 		progress += length;
2403 		m_new->m_len = length;
2404 		if (m_new != m_final)
2405 			m_cat(m_final, m_new);
2406 		m_new = NULL;
2407 	}
2408 	if (m0->m_next == NULL)
2409 		m_defraguseless++;
2410 	m_defragpackets++;
2411 	m_defragbytes += m_final->m_pkthdr.len;
2412 	return (m_final);
2413 nospace:
2414 	m_defragfailure++;
2415 	if (m_new)
2416 		m_free(m_new);
2417 	m_freem(m_final);
2418 	return (NULL);
2419 }
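
/*
 * Example usage (illustrative sketch): a transmit path whose hardware can
 * only handle a limited number of DMA segments might defragment a long
 * chain before loading it; chain_segments() and MAX_TX_SEGS are
 * hypothetical.  On success the original chain has been freed, on failure
 * it is left untouched.
 *
 *	if (chain_segments(m) > MAX_TX_SEGS) {
 *		struct mbuf *d;
 *
 *		d = m_defrag(m, MB_DONTWAIT);
 *		if (d == NULL) {
 *			m_freem(m);
 *			return (ENOBUFS);
 *		}
 *		m = d;
 *	}
 */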
2420 
2421 /*
2422  * Move data from uio into mbufs.
2423  */
2424 struct mbuf *
2425 m_uiomove(struct uio *uio)
2426 {
2427 	struct mbuf *m;			/* current working mbuf */
2428 	struct mbuf *head = NULL;	/* result mbuf chain */
2429 	struct mbuf **mp = &head;
2430 	int flags = M_PKTHDR;
2431 	int nsize;
2432 	int error;
2433 	int resid;
2434 
2435 	do {
2436 		if (uio->uio_resid > INT_MAX)
2437 			resid = INT_MAX;
2438 		else
2439 			resid = (int)uio->uio_resid;
2440 		m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
2441 		if (flags) {
2442 			m->m_pkthdr.len = 0;
2443 			/* Leave room for protocol headers. */
2444 			if (resid < MHLEN)
2445 				MH_ALIGN(m, resid);
2446 			flags = 0;
2447 		}
2448 		m->m_len = imin(nsize, resid);
2449 		error = uiomove(mtod(m, caddr_t), m->m_len, uio);
2450 		if (error) {
2451 			m_free(m);
2452 			goto failed;
2453 		}
2454 		*mp = m;
2455 		mp = &m->m_next;
2456 		head->m_pkthdr.len += m->m_len;
2457 	} while (uio->uio_resid > 0);
2458 
2459 	return (head);
2460 
2461 failed:
2462 	m_freem(head);
2463 	return (NULL);
2464 }
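
/*
 * Example usage (illustrative sketch): turning the user data described by a
 * uio (e.g. from a write(2) on a character device) into a packet-header
 * mbuf chain.  NULL is returned only if copying from userspace failed; the
 * partially built chain has already been freed in that case.
 *
 *	struct mbuf *m;
 *
 *	m = m_uiomove(uio);
 *	if (m == NULL)
 *		return (EFAULT);
 */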
2465 
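/*
 * Return the last mbuf in a chain.
 */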
2466 struct mbuf *
2467 m_last(struct mbuf *m)
2468 {
2469 	while (m->m_next)
2470 		m = m->m_next;
2471 	return (m);
2472 }
2473 
2474 /*
2475  * Return the number of bytes in an mbuf chain.
2476  * If lastm is not NULL, also return the last mbuf.
2477  */
2478 u_int
2479 m_lengthm(struct mbuf *m, struct mbuf **lastm)
2480 {
2481 	u_int len = 0;
2482 	struct mbuf *prev = m;
2483 
2484 	while (m) {
2485 		len += m->m_len;
2486 		prev = m;
2487 		m = m->m_next;
2488 	}
2489 	if (lastm != NULL)
2490 		*lastm = prev;
2491 	return (len);
2492 }
2493 
2494 /*
2495  * Like m_lengthm(), except also keep track of mbuf usage.
2496  */
2497 u_int
2498 m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
2499 {
2500 	u_int len = 0, mbcnt = 0;
2501 	struct mbuf *prev = m;
2502 
2503 	while (m) {
2504 		len += m->m_len;
2505 		mbcnt += MSIZE;
2506 		if (m->m_flags & M_EXT)
2507 			mbcnt += m->m_ext.ext_size;
2508 		prev = m;
2509 		m = m->m_next;
2510 	}
2511 	if (lastm != NULL)
2512 		*lastm = prev;
2513 	*pmbcnt = mbcnt;
2514 	return (len);
2515 }
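
/*
 * Example usage (illustrative sketch): m_countm() is handy when charging a
 * chain against a socket buffer, where both the payload length and the
 * memory footprint (MSIZE plus any external storage) matter.
 *
 *	u_int bytes, mbcnt;
 *	struct mbuf *last;
 *
 *	bytes = m_countm(m, &last, &mbcnt);
 */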
2516