xref: /dragonfly/sys/kern/uipc_mbuf.c (revision 27f48495)
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
36  *
37  * License terms: all terms for the DragonFly license above plus the following:
38  *
39  * 4. All advertising materials mentioning features or use of this software
40  *    must display the following acknowledgement:
41  *
42  *	This product includes software developed by Jeffrey M. Hsu
43  *	for the DragonFly Project.
44  *
45  *    This requirement may be waived with permission from Jeffrey Hsu.
46  *    This requirement will sunset and may be removed on July 8 2005,
47  *    after which the standard DragonFly license (as shown above) will
48  *    apply.
49  */
50 
51 /*
52  * Copyright (c) 1982, 1986, 1988, 1991, 1993
53  *	The Regents of the University of California.  All rights reserved.
54  *
55  * Redistribution and use in source and binary forms, with or without
56  * modification, are permitted provided that the following conditions
57  * are met:
58  * 1. Redistributions of source code must retain the above copyright
59  *    notice, this list of conditions and the following disclaimer.
60  * 2. Redistributions in binary form must reproduce the above copyright
61  *    notice, this list of conditions and the following disclaimer in the
62  *    documentation and/or other materials provided with the distribution.
63  * 3. All advertising materials mentioning features or use of this software
64  *    must display the following acknowledgement:
65  *	This product includes software developed by the University of
66  *	California, Berkeley and its contributors.
67  * 4. Neither the name of the University nor the names of its contributors
68  *    may be used to endorse or promote products derived from this software
69  *    without specific prior written permission.
70  *
71  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81  * SUCH DAMAGE.
82  *
83  * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
84  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
85  * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.34 2005/02/20 00:20:43 joerg Exp $
86  */
87 
88 #include "opt_param.h"
89 #include "opt_mbuf_stress_test.h"
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/malloc.h>
93 #include <sys/mbuf.h>
94 #include <sys/kernel.h>
95 #include <sys/sysctl.h>
96 #include <sys/domain.h>
97 #include <sys/protosw.h>
98 #include <sys/uio.h>
99 #include <sys/thread.h>
100 #include <sys/globaldata.h>
101 #include <sys/thread2.h>
102 
103 #include <vm/vm.h>
104 #include <vm/vm_kern.h>
105 #include <vm/vm_extern.h>
106 
107 #ifdef INVARIANTS
108 #include <machine/cpu.h>
109 #endif
110 
111 /*
112  * mbuf cluster meta-data
113  */
114 typedef struct mbcluster {
115 	struct mbcluster *mcl_next;
116 	int32_t	mcl_magic;
117 	int32_t	mcl_refs;
118 	void	*mcl_data;
119 } *mbcluster_t;
120 
121 typedef struct mbuf *mbuf_t;
122 
123 #define MCL_MAGIC	0x6d62636c
124 
125 static void mbinit (void *);
126 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
127 
128 static u_long	mbtypes[MT_NTYPES];
129 
130 struct mbstat mbstat;
131 int	max_linkhdr;
132 int	max_protohdr;
133 int	max_hdr;
134 int	max_datalen;
135 int	m_defragpackets;
136 int	m_defragbytes;
137 int	m_defraguseless;
138 int	m_defragfailure;
139 #ifdef MBUF_STRESS_TEST
140 int	m_defragrandomfailures;
141 #endif
142 
143 int	nmbclusters;
144 int	nmbufs;
145 u_int	m_mballoc_wid = 0;
146 u_int	m_clalloc_wid = 0;
147 
148 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
149 	   &max_linkhdr, 0, "");
150 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
151 	   &max_protohdr, 0, "");
152 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
153 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
154 	   &max_datalen, 0, "");
155 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
156 	   &mbuf_wait, 0, "");
157 SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
158 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
159 	   sizeof(mbtypes), "LU", "");
160 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RW,
161 	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
162 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RW, &nmbufs, 0,
163 	   "Maximum number of mbufs available");
164 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
165 	   &m_defragpackets, 0, "");
166 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
167 	   &m_defragbytes, 0, "");
168 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
169 	   &m_defraguseless, 0, "");
170 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
171 	   &m_defragfailure, 0, "");
172 #ifdef MBUF_STRESS_TEST
173 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
174 	   &m_defragrandomfailures, 0, "");
175 #endif
176 
177 static int mcl_pool_count;
178 static int mcl_pool_max = 20;
179 static int mcl_free_max = 1000;
180 static int mbuf_free_max = 5000;
181 
182 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_max, CTLFLAG_RW, &mcl_pool_max, 0,
183            "Maximum number of mbufs+cluster in free list");
184 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_count, CTLFLAG_RD, &mcl_pool_count, 0,
185            "Current number of mbufs+cluster in free list");
186 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_free_max, CTLFLAG_RW, &mcl_free_max, 0,
187            "Maximum number of clusters on the free list");
188 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_free_max, CTLFLAG_RW, &mbuf_free_max, 0,
189            "Maximum number of mbufs on the free list");
190 
191 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
192 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
193 
194 static mbuf_t mmbfree;
195 static mbcluster_t mclfree;
196 static struct mbuf *mcl_pool;
197 
198 static void m_reclaim (void);
199 static int m_mballoc(int nmb, int how);
200 static int m_clalloc(int ncl, int how);
201 static struct mbuf *m_mballoc_wait(int caller, int type);
202 static void m_mclref(void *arg);
203 static void m_mclfree(void *arg);
204 
205 #ifndef NMBCLUSTERS
206 #define NMBCLUSTERS	(512 + maxusers * 16)
207 #endif
208 #ifndef NMBUFS
209 #define NMBUFS		(nmbclusters * 4)
210 #endif
211 
212 /*
213  * Perform sanity checks of tunables declared above.
214  */
215 static void
216 tunable_mbinit(void *dummy)
217 {
218 
219 	/*
220 	 * This has to be done before VM init.
221 	 */
222 	nmbclusters = NMBCLUSTERS;
223 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
224 	nmbufs = NMBUFS;
225 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
226 	/* Sanity checks */
227 	if (nmbufs < nmbclusters * 2)
228 		nmbufs = nmbclusters * 2;
229 
230 	return;
231 }
232 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
233 
234 /* "number of clusters of pages" */
235 #define NCL_INIT	1
236 
237 #define NMB_INIT	16
238 
239 /* ARGSUSED*/
240 static void
241 mbinit(void *dummy)
242 {
243 	mmbfree = NULL;
244 	mclfree = NULL;
245 	mbstat.m_msize = MSIZE;
246 	mbstat.m_mclbytes = MCLBYTES;
247 	mbstat.m_minclsize = MINCLSIZE;
248 	mbstat.m_mlen = MLEN;
249 	mbstat.m_mhlen = MHLEN;
250 
251 	crit_enter();
252 	if (m_mballoc(NMB_INIT, MB_DONTWAIT) == 0)
253 		goto bad;
254 #if MCLBYTES <= PAGE_SIZE
255 	if (m_clalloc(NCL_INIT, MB_DONTWAIT) == 0)
256 		goto bad;
257 #else
258 	/* It's OK to call contigmalloc in this context. */
259 	if (m_clalloc(16, MB_WAIT) == 0)
260 		goto bad;
261 #endif
262 	crit_exit();
263 	return;
264 bad:
265 	crit_exit();
266 	panic("mbinit");
267 }
268 
269 /*
270  * Allocate up to nmb mbufs and place them on the mbuf free list.
271  * Returns the number of mbufs successfully allocated, 0 if none.
272  *
273  * Must be called while in a critical section.
274  */
275 static int
276 m_mballoc(int nmb, int how)
277 {
278 	int i;
279 	struct mbuf *m;
280 
281 	/*
282 	 * If we've hit the mbuf limit, stop allocating (or trying to)
283 	 * in order to avoid exhausting kernel memory entirely.
284 	 */
285 	if ((nmb + mbstat.m_mbufs) > nmbufs)
286 		return (0);
287 
288 	/*
289 	 * Attempt to allocate the requested number of mbufs, terminate when
290 	 * the allocation fails but if blocking is allowed allocate at least
291 	 * one.
292 	 */
293 	for (i = 0; i < nmb; ++i) {
294 		m = malloc(MSIZE, M_MBUF, M_NOWAIT|M_NULLOK|M_ZERO);
295 		if (m == NULL) {
296 			if (how == MB_WAIT) {
297 				mbstat.m_wait++;
298 				m = malloc(MSIZE, M_MBUF,
299 					    M_WAITOK|M_NULLOK|M_ZERO);
300 			}
301 			if (m == NULL)
302 				break;
303 		}
304 		m->m_next = mmbfree;
305 		mmbfree = m;
306 		++mbstat.m_mbufs;
307 		++mbtypes[MT_FREE];
308 		how = MB_DONTWAIT;
309 	}
310 	return(i);
311 }
312 
313 /*
314  * Once mbuf memory has been exhausted and the allocation macros (or, in some
315  * cases, functions) have been called with MB_WAIT, it is necessary to rely
316  * solely on reclaimed mbufs.  Here we wait for an mbuf to be freed for a
317  * designated (mbuf_wait) time.
318  */
319 static struct mbuf *
320 m_mballoc_wait(int caller, int type)
321 {
322 	struct mbuf *m;
323 
324 	crit_enter();
325 	m_mballoc_wid++;
326 	if ((tsleep(&m_mballoc_wid, 0, "mballc", mbuf_wait)) == EWOULDBLOCK)
327 		m_mballoc_wid--;
328 	crit_exit();
329 
330 	/*
331 	 * Now that we (think) that we've got something, we will redo an
332 	 * MGET, but avoid getting into another instance of m_mballoc_wait()
333 	 * XXX: We retry to fetch _even_ if the sleep timed out. This is left
334 	 *      this way, purposely, in the [unlikely] case that an mbuf was
335 	 *      freed but the sleep was not awakened in time.
336 	 */
337 	m = NULL;
338 	switch (caller) {
339 	case MGET_C:
340 		MGET(m, MB_DONTWAIT, type);
341 		break;
342 	case MGETHDR_C:
343 		MGETHDR(m, MB_DONTWAIT, type);
344 		break;
345 	default:
346 		panic("m_mballoc_wait: invalid caller (%d)", caller);
347 	}
348 
349 	crit_enter();
350 	if (m != NULL) {		/* We waited and got something... */
351 		mbstat.m_wait++;
352 		/* Wake up another if we have more free. */
353 		if (mmbfree != NULL)
354 			MMBWAKEUP();
355 	}
356 	crit_exit();
357 	return (m);
358 }
359 
360 #if MCLBYTES > PAGE_SIZE
361 static int i_want_my_mcl;
362 
363 static void
364 kproc_mclalloc(void)
365 {
366 	int status;
367 
368 	crit_enter();
369 	for (;;) {
370 		tsleep(&i_want_my_mcl, 0, "mclalloc", 0);
371 
372 		while (i_want_my_mcl > 0) {
373 			if (m_clalloc(1, MB_WAIT) == 0)
374 				printf("m_clalloc failed even in thread context!\n");
375 			--i_want_my_mcl;
376 		}
377 	}
378 	/* not reached */
379 	crit_exit();
380 }
381 
382 static struct thread *mclallocthread;
383 static struct kproc_desc mclalloc_kp = {
384 	"mclalloc",
385 	kproc_mclalloc,
386 	&mclallocthread
387 };
388 SYSINIT(mclallocthread, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
389 	   &mclalloc_kp);
390 #endif
391 
392 /*
393  * Allocate up to ncl mbuf clusters and place them on the cluster free list.
394  * Returns the number of mbuf clusters successfully allocated, 0 if none.
395  *
396  * Must be called while in a critical section.
397  */
398 static int
399 m_clalloc(int ncl, int how)
400 {
401 	static int last_report;
402 	mbcluster_t mcl;
403 	void *data;
404 	int i;
405 
406 	/*
407 	 * If we've hit the mbuf cluster limit, stop allocating (or trying to).
408 	 */
409 	if ((ncl + mbstat.m_clusters) > nmbclusters)
410 		ncl = 0;
411 
412 	/*
413 	 * Attempt to allocate the requested number of mbuf clusters,
414 	 * terminate when the allocation fails but if blocking is allowed
415 	 * allocate at least one.
416 	 *
417 	 * We need to allocate two structures for each cluster... a
418 	 * ref counting / governing structure and the actual data.  MCLBYTES
419 	 * should be a power of 2 which means that the slab allocator will
420 	 * return a buffer that does not cross a page boundary.
421 	 */
422 	for (i = 0; i < ncl; ++i) {
423 		/*
424 		 * Meta structure
425 		 */
426 		mcl = malloc(sizeof(*mcl), M_MBUFCL, M_NOWAIT|M_NULLOK|M_ZERO);
427 		if (mcl == NULL) {
428 			if (how == MB_WAIT) {
429 				mbstat.m_wait++;
430 				mcl = malloc(sizeof(*mcl),
431 					    M_MBUFCL, M_WAITOK|M_NULLOK|M_ZERO);
432 			}
433 			if (mcl == NULL)
434 				break;
435 		}
436 
437 		/*
438 		 * Physically contiguous data buffer.
439 		 */
440 #if MCLBYTES > PAGE_SIZE
441 		if (how != MB_WAIT) {
442 			i_want_my_mcl += ncl - i;
443 			wakeup(&i_want_my_mcl);
444 			mbstat.m_wait++;
445 			data = NULL;
446 		} else {
447 			data = contigmalloc_map(MCLBYTES, M_MBUFCL,
448 				M_WAITOK, 0ul, ~0ul, PAGE_SIZE, 0, kernel_map);
449 		}
450 #else
451 		data = malloc(MCLBYTES, M_MBUFCL, M_NOWAIT|M_NULLOK);
452 		if (data == NULL) {
453 			if (how == MB_WAIT) {
454 				mbstat.m_wait++;
455 				data = malloc(MCLBYTES, M_MBUFCL,
456 						M_WAITOK|M_NULLOK);
457 			}
458 		}
459 #endif
460 		if (data == NULL) {
461 			free(mcl, M_MBUFCL);
462 			break;
463 		}
464 		mcl->mcl_next = mclfree;
465 		mcl->mcl_data = data;
466 		mcl->mcl_magic = MCL_MAGIC;
467 		mcl->mcl_refs = 0;
468 		mclfree = mcl;
469 		++mbstat.m_clfree;
470 		++mbstat.m_clusters;
471 		how = MB_DONTWAIT;
472 	}
473 
474 	/*
475 	 * If we could not allocate any, report the failure no more often than
476 	 * once a second.
477 	 */
478 	if (i == 0) {
479 		mbstat.m_drops++;
480 		if (ticks < last_report || (ticks - last_report) >= hz) {
481 			last_report = ticks;
482 			printf("All mbuf clusters exhausted, please see tuning(7).\n");
483 		}
484 	}
485 	return (i);
486 }
487 
488 /*
489  * Once cluster memory has been exhausted and the allocation is called with
490  * MB_WAIT, we rely on the mclfree list.  If nothing is free, we will
491  * sleep for a designated amount of time (mbuf_wait) or until we're woken up
492  * due to sudden mbuf cluster availability.
493  *
494  * Must be called while in a critical section.
495  */
496 static void
497 m_clalloc_wait(void)
498 {
499 	/* If in interrupt context, and INVARIANTS, maintain sanity and die. */
500 	KASSERT(mycpu->gd_intr_nesting_level == 0,
501 		("CLALLOC: CANNOT WAIT IN INTERRUPT"));
502 
503 	/*
504 	 * Sleep until something's available or until we expire.
505 	 */
506 	m_clalloc_wid++;
507 	if ((tsleep(&m_clalloc_wid, 0, "mclalc", mbuf_wait)) == EWOULDBLOCK)
508 		m_clalloc_wid--;
509 
510 	/*
511 	 * Try the allocation once more, and if we see more than two
512 	 * free entries, wake up others as well.
513 	 */
514 	m_clalloc(1, MB_WAIT);
515 	if (mclfree && mclfree->mcl_next) {
516 		MCLWAKEUP();
517 	}
518 }
519 
520 /*
521  * Return the number of references to this mbuf's data.  0 is returned
522  * if the mbuf is not M_EXT, a reference count is returned if it is
523  * M_EXT|M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
524  */
525 int
526 m_sharecount(struct mbuf *m)
527 {
528     int count;
529 
530     switch(m->m_flags & (M_EXT|M_EXT_CLUSTER)) {
531     case 0:
532 	count = 0;
533 	break;
534     case M_EXT:
535 	count = 99;
536 	break;
537     case M_EXT|M_EXT_CLUSTER:
538 	count = ((mbcluster_t)m->m_ext.ext_arg)->mcl_refs;
539 	break;
540     default:
541 	panic("bad mbuf flags: %p", m);
542 	count = 0;
543     }
544     return(count);
545 }
546 
547 /*
548  * change mbuf to new type
549  */
550 void
551 m_chtype(struct mbuf *m, int type)
552 {
553 	crit_enter();
554 	--mbtypes[m->m_type];
555 	++mbtypes[type];
556 	m->m_type = type;
557 	crit_exit();
558 }
559 
560 /*
561  * When MGET fails, ask protocols to free space when short of memory,
562  * then re-attempt to allocate an mbuf.
563  */
564 struct mbuf *
565 m_retry(int how, int t)
566 {
567 	struct mbuf *m;
568 
569 	/*
570 	 * Must only do the reclaim if not in an interrupt context.
571 	 */
572 	if (how == MB_WAIT) {
573 		KASSERT(mycpu->gd_intr_nesting_level == 0,
574 		    ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
575 		m_reclaim();
576 	}
577 
578 	/*
579 	 * Try to pull a new mbuf out of the cache, if the cache is empty
580 	 * try to allocate a new one and if that doesn't work we give up.
581 	 */
582 	crit_enter();
583 	if ((m = mmbfree) == NULL) {
584 		m_mballoc(1, how);
585 		if ((m = mmbfree) == NULL) {
586 			static int last_report;
587 
588 			mbstat.m_drops++;
589 			crit_exit();
590 			if (ticks < last_report ||
591 			    (ticks - last_report) >= hz) {
592 				last_report = ticks;
593 				printf("All mbufs exhausted, please see tuning(7).\n");
594 			}
595 			return (NULL);
596 		}
597 	}
598 
599 	/*
600 	 * Cache case, adjust globals before leaving the critical section
601 	 */
602 	mmbfree = m->m_next;
603 	mbtypes[MT_FREE]--;
604 	mbtypes[t]++;
605 	mbstat.m_wait++;
606 	crit_exit();
607 
608 	m->m_type = t;
609 	m->m_next = NULL;
610 	m->m_nextpkt = NULL;
611 	m->m_data = m->m_dat;
612 	m->m_flags = 0;
613 	return (m);
614 }
615 
616 /*
617  * As above; retry an MGETHDR.
618  */
619 struct mbuf *
620 m_retryhdr(int how, int t)
621 {
622 	struct mbuf *m;
623 
624 	/*
625 	 * Must only do the reclaim if not in an interrupt context.
626 	 */
627 	if (how == MB_WAIT) {
628 		KASSERT(mycpu->gd_intr_nesting_level == 0,
629 		    ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
630 		m_reclaim();
631 	}
632 
633 	/*
634 	 * Try to pull a new mbuf out of the cache, if the cache is empty
635 	 * try to allocate a new one and if that doesn't work we give up.
636 	 */
637 	crit_enter();
638 	if ((m = mmbfree) == NULL) {
639 		m_mballoc(1, how);
640 		if ((m = mmbfree) == NULL) {
641 			static int last_report;
642 
643 			mbstat.m_drops++;
644 			crit_exit();
645 			if (ticks < last_report ||
646 			    (ticks - last_report) >= hz) {
647 				last_report = ticks;
648 				printf("All mbufs exhausted, please see tuning(7).\n");
649 			}
650 			return (NULL);
651 		}
652 	}
653 
654 	/*
655 	 * Cache case, adjust globals before leaving the critical section
656 	 */
657 	mmbfree = m->m_next;
658 	mbtypes[MT_FREE]--;
659 	mbtypes[t]++;
660 	mbstat.m_wait++;
661 	crit_exit();
662 
663 	m->m_type = t;
664 	m->m_next = NULL;
665 	m->m_nextpkt = NULL;
666 	m->m_data = m->m_pktdat;
667 	m->m_flags = M_PKTHDR;
668 	m->m_pkthdr.rcvif = NULL;
669 	SLIST_INIT(&m->m_pkthdr.tags);
670 	m->m_pkthdr.csum_flags = 0;
671 	return (m);
672 }
673 
674 static void
675 m_reclaim(void)
676 {
677 	struct domain *dp;
678 	struct protosw *pr;
679 
680 	crit_enter();
681 	for (dp = domains; dp; dp = dp->dom_next) {
682 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
683 			if (pr->pr_drain)
684 				(*pr->pr_drain)();
685 		}
686 	}
687 	crit_exit();
688 	mbstat.m_drain++;
689 }
690 
691 /*
692  * Allocate an mbuf.  If no mbufs are immediately available try to
693  * bring a bunch more into our cache (mmbfree list).  A critical
694  * section is required to protect the mmbfree list and counters
695  * against interrupts.
696  */
697 struct mbuf *
698 m_get(int how, int type)
699 {
700 	struct mbuf *m;
701 
702 	/*
703 	 * Try to pull a new mbuf out of the cache, if the cache is empty
704 	 * try to allocate a new one and if that doesn't work try even harder
705 	 * by calling m_retry().
706 	 */
707 	crit_enter();
708 	if ((m = mmbfree) == NULL) {
709 		m_mballoc(1, how);
710 		if ((m = mmbfree) == NULL) {
711 			crit_exit();
712 			m = m_retry(how, type);
713 			if (m == NULL && how == MB_WAIT)
714 				m = m_mballoc_wait(MGET_C, type);
715 			return (m);
716 		}
717 	}
718 
719 	/*
720 	 * Cache case, adjust globals before leaving the critical section
721 	 */
722 	mmbfree = m->m_next;
723 	mbtypes[MT_FREE]--;
724 	mbtypes[type]++;
725 	crit_exit();
726 
727 	m->m_type = type;
728 	m->m_next = NULL;
729 	m->m_nextpkt = NULL;
730 	m->m_data = m->m_dat;
731 	m->m_flags = 0;
732 	return (m);
733 }
734 
735 struct mbuf *
736 m_gethdr(int how, int type)
737 {
738 	struct mbuf *m;
739 
740 	/*
741 	 * Try to pull a new mbuf out of the cache, if the cache is empty
742 	 * try to allocate a new one and if that doesn't work try even harder
743 	 * by calling m_retryhdr().
744 	 */
745 	crit_enter();
746 	if ((m = mmbfree) == NULL) {
747 		m_mballoc(1, how);
748 		if ((m = mmbfree) == NULL) {
749 			crit_exit();
750 			m = m_retryhdr(how, type);
751 			if (m == NULL && how == MB_WAIT)
752 				m = m_mballoc_wait(MGETHDR_C, type);
753 			return(m);
754 		}
755 	}
756 
757 	/*
758 	 * Cache case, adjust globals before leaving the critical section
759 	 */
760 	mmbfree = m->m_next;
761 	mbtypes[MT_FREE]--;
762 	mbtypes[type]++;
763 	crit_exit();
764 
765 	m->m_type = type;
766 	m->m_next = NULL;
767 	m->m_nextpkt = NULL;
768 	m->m_data = m->m_pktdat;
769 	m->m_flags = M_PKTHDR;
770 	m->m_pkthdr.rcvif = NULL;
771 	SLIST_INIT(&m->m_pkthdr.tags);
772 	m->m_pkthdr.csum_flags = 0;
773 	m->m_pkthdr.fw_flags = 0;
774 	return (m);
775 }
776 
777 struct mbuf *
778 m_getclr(int how, int type)
779 {
780 	struct mbuf *m;
781 
782 	if ((m = m_get(how, type)) != NULL) {
783 		bzero(mtod(m, caddr_t), MLEN);
784 	}
785 	return (m);
786 }
787 
788 /*
789  * m_getcl() returns an mbuf with an attached cluster.
790  * Because many network drivers use this kind of buffer a lot, it is
791  * convenient to keep a small pool of free buffers of this kind.
792  * Even a small size such as 10 gives about 10% improvement in the
793  * forwarding rate in a bridge or router.
794  * The size of this free list is controlled by the sysctl variable
795  * mcl_pool_max. The list is populated on m_freem(), and used in
796  * m_getcl() if elements are available.
797  */
798 struct mbuf *
799 m_getcl(int how, short type, int flags)
800 {
801 	struct mbuf *mp;
802 
803 	crit_enter();
804 	if (flags & M_PKTHDR) {
805 		if (type == MT_DATA && mcl_pool) {
806 			mp = mcl_pool;
807 			mcl_pool = mp->m_nextpkt;
808 			--mcl_pool_count;
809 			crit_exit();
810 			mp->m_nextpkt = NULL;
811 			mp->m_data = mp->m_ext.ext_buf;
812 			mp->m_flags = M_PKTHDR|M_EXT|M_EXT_CLUSTER;
813 			mp->m_pkthdr.rcvif = NULL;
814 			mp->m_pkthdr.csum_flags = 0;
815 			return mp;
816 		}
817 		MGETHDR(mp, how, type);
818 	} else {
819 		MGET(mp, how, type);
820 	}
821 	if (mp) {
822 		m_mclget(mp, how);
823 		if ((mp->m_flags & M_EXT) == 0) {
824 			m_free(mp);
825 			mp = NULL;
826 		}
827 	}
828 	crit_exit();
829 	return (mp);
830 }
831 
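/*
 * Example: a minimal sketch of a typical m_getcl() caller, e.g. a
 * hypothetical driver receive-buffer routine (the xxx_newbuf() name and
 * the surrounding error handling are illustrative only):
 *
 *	struct mbuf *
 *	xxx_newbuf(void)
 *	{
 *		struct mbuf *m;
 *
 *		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
 *		if (m == NULL)
 *			return (NULL);
 *		m->m_len = m->m_pkthdr.len = MCLBYTES;
 *		return (m);
 *	}
 *
 * When the small mbuf+cluster pool is empty, m_getcl() falls back to a
 * normal MGETHDR() plus m_mclget(), so a NULL return means that either
 * an mbuf or a cluster could not be obtained.
 */
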
832 /*
833  * struct mbuf *
834  * m_getm(m, len, how, type)
835  *
836  * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits
837  * best) and return a pointer to the top of the allocated chain. If m is
838  * non-null, then we assume that it is a single mbuf or an mbuf chain to
839  * which we want len bytes worth of mbufs and/or clusters attached, and so
840  * if we succeed in allocating it, we will just return a pointer to m.
841  *
842  * If we happen to fail at any point during the allocation, we will free
843  * up everything we have already allocated and return NULL.
844  *
845  */
846 struct mbuf *
847 m_getm(struct mbuf *m, int len, int how, int type)
848 {
849 	struct mbuf *top, *tail, *mp, *mtail = NULL;
850 
851 	KASSERT(len >= 0, ("len is < 0 in m_getm"));
852 
853 	mp = m_get(how, type);
854 	if (mp == NULL) {
855 		return (NULL);
856 	} else if (len > MINCLSIZE) {
857 		m_mclget(mp, how);
858 		if ((mp->m_flags & M_EXT) == 0) {
859 			m_free(mp);
860 			return (NULL);
861 		}
862 	}
863 	mp->m_len = 0;
864 	len -= M_TRAILINGSPACE(mp);
865 
866 	if (m != NULL) {
867 		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
868 			;
869 	} else {
870 		m = mp;
871 	}
872 
873 	top = tail = mp;
874 	while (len > 0) {
875 		mp = m_get(how, type);
876 		if (mp == NULL)
877 			goto failed;
878 
879 		tail->m_next = mp;
880 		tail = mp;
881 		if (len > MINCLSIZE) {
882 			m_mclget(mp, how);
883 			if ((mp->m_flags & M_EXT) == 0)
884 				goto failed;
885 		}
886 
887 		mp->m_len = 0;
888 		len -= M_TRAILINGSPACE(mp);
889 	}
890 
891 	if (mtail != NULL)
892 		mtail->m_next = top;
893 	return (m);
894 failed:
895 	m_freem(top);
896 	return (NULL);
897 }
898 
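/*
 * Example: a hypothetical sketch of using m_getm() to make sure a chain
 * has at least "len" more bytes of buffer space appended (names are
 * illustrative only; note that on success the original pointer is
 * returned when m was non-NULL):
 *
 *	if (m_getm(m, len, MB_DONTWAIT, MT_DATA) == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */
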
899 /*
900  *  m_mclget() - Adds a cluster to a normal mbuf, M_EXT is set on success.
901  */
902 void
903 m_mclget(struct mbuf *m, int how)
904 {
905 	mbcluster_t mcl;
906 
907 	KKASSERT((m->m_flags & M_EXT_OLD) == 0);
908 
909 	/*
910 	 * Allocate a cluster, return if we can't get one.
911 	 */
912 	crit_enter();
913 	if ((mcl = mclfree) == NULL) {
914 		m_clalloc(1, how);
915 		if ((mcl = mclfree) == NULL) {
916 			if (how == MB_WAIT) {
917 				m_clalloc_wait();
918 				mcl = mclfree;
919 			}
920 			if (mcl == NULL) {
921 				crit_exit();
922 				return;
923 			}
924 		}
925 	}
926 
927 	/*
928 	 * We have a cluster, unlink it from the free list and set the ref
929 	 * count.
930 	 */
931 	KKASSERT(mcl->mcl_refs == 0);
932 	mclfree = mcl->mcl_next;
933 	mcl->mcl_refs = 1;
934 	--mbstat.m_clfree;
935 	crit_exit();
936 
937 	/*
938 	 * Add the cluster to the mbuf.  The caller will detect that the
939 	 * mbuf now has an attached cluster.
940 	 */
941 	m->m_ext.ext_arg = mcl;
942 	m->m_ext.ext_buf = mcl->mcl_data;
943 	m->m_ext.ext_nref.new = m_mclref;
944 	m->m_ext.ext_nfree.new = m_mclfree;
945 	m->m_ext.ext_size = MCLBYTES;
946 
947 	m->m_data = m->m_ext.ext_buf;
948 	m->m_flags |= M_EXT | M_EXT_CLUSTER;
949 }
950 
951 static void
952 m_mclfree(void *arg)
953 {
954 	mbcluster_t mcl = arg;
955 
956 	KKASSERT(mcl->mcl_magic == MCL_MAGIC);
957 	KKASSERT(mcl->mcl_refs > 0);
958 	crit_enter();
959 	if (--mcl->mcl_refs == 0) {
960 		if (mbstat.m_clfree < mcl_free_max) {
961 			mcl->mcl_next = mclfree;
962 			mclfree = mcl;
963 			++mbstat.m_clfree;
964 			MCLWAKEUP();
965 		} else {
966 			mcl->mcl_magic = -1;
967 			free(mcl->mcl_data, M_MBUFCL);
968 			free(mcl, M_MBUFCL);
969 			--mbstat.m_clusters;
970 		}
971 	}
972 	crit_exit();
973 }
974 
975 static void
976 m_mclref(void *arg)
977 {
978 	mbcluster_t mcl = arg;
979 
980 	KKASSERT(mcl->mcl_magic == MCL_MAGIC);
981 	crit_enter();
982 	++mcl->mcl_refs;
983 	crit_exit();
984 }
985 
986 /*
987  * Helper routines for M_EXT reference/free
988  */
989 static __inline void
990 m_extref(const struct mbuf *m)
991 {
992 	KKASSERT(m->m_ext.ext_nfree.any != NULL);
993 	crit_enter();
994 	if (m->m_flags & M_EXT_OLD)
995 		m->m_ext.ext_nref.old(m->m_ext.ext_buf, m->m_ext.ext_size);
996 	else
997 		m->m_ext.ext_nref.new(m->m_ext.ext_arg);
998 	crit_exit();
999 }
1000 
1001 /*
1002  * m_free()
1003  *
1004  * Free a single mbuf and any associated external storage.  The successor,
1005  * if any, is returned.
1006  *
1007  * We do need to check non-first mbufs for m_aux, since some existing
1008  * code does not call M_PREPEND properly
1009  * (for example, calls to bpf_mtap from drivers).
1010  */
1011 struct mbuf *
1012 m_free(struct mbuf *m)
1013 {
1014 	struct mbuf *n;
1015 
1016 	crit_enter();
1017 	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
1018 
1019 	/*
1020 	 * Adjust our type count and delete any attached chains if the
1021 	 * mbuf is a packet header.
1022 	 */
1023 	if ((m->m_flags & M_PKTHDR) != 0)
1024 		m_tag_delete_chain(m, NULL);
1025 
1026 	/*
1027 	 * Place the mbuf on the appropriate free list.  Try to maintain a
1028 	 * small cache of mbuf+cluster pairs.
1029 	 */
1030 	n = m->m_next;
1031 	m->m_next = NULL;
1032 	if (m->m_flags & M_EXT) {
1033 		KKASSERT(m->m_ext.ext_nfree.any != NULL);
1034 		if (mcl_pool_count < mcl_pool_max && m && m->m_next == NULL &&
1035 		    (m->m_flags & (M_PKTHDR|M_EXT_CLUSTER)) == (M_PKTHDR|M_EXT_CLUSTER) &&
1036 		    m->m_type == MT_DATA && M_EXT_WRITABLE(m) ) {
1037 			KKASSERT(((mbcluster_t)m->m_ext.ext_arg)->mcl_magic == MCL_MAGIC);
1038 			m->m_nextpkt = mcl_pool;
1039 			mcl_pool = m;
1040 			++mcl_pool_count;
1041 			m = NULL;
1042 		} else {
1043 			if (m->m_flags & M_EXT_OLD)
1044 				m->m_ext.ext_nfree.old(m->m_ext.ext_buf, m->m_ext.ext_size);
1045 			else
1046 				m->m_ext.ext_nfree.new(m->m_ext.ext_arg);
1047 			m->m_flags = 0;
1048 			m->m_ext.ext_arg = NULL;
1049 			m->m_ext.ext_nref.new = NULL;
1050 			m->m_ext.ext_nfree.new = NULL;
1051 		}
1052 	}
1053 	if (m) {
1054 		--mbtypes[m->m_type];
1055 		if (mbtypes[MT_FREE] < mbuf_free_max) {
1056 			m->m_type = MT_FREE;
1057 			mbtypes[MT_FREE]++;
1058 			m->m_next = mmbfree;
1059 			mmbfree = m;
1060 			MMBWAKEUP();
1061 		} else {
1062 			free(m, M_MBUF);
1063 			--mbstat.m_mbufs;
1064 		}
1065 	}
1066 	crit_exit();
1067 	return (n);
1068 }
1069 
1070 void
1071 m_freem(struct mbuf *m)
1072 {
1073 	crit_enter();
1074 	while (m)
1075 		m = m_free(m);
1076 	crit_exit();
1077 }
1078 
1079 /*
1080  * Mbuffer utility routines.
1081  */
1082 
1083 /*
1084  * Lesser-used path for M_PREPEND:
1085  * allocate new mbuf to prepend to chain,
1086  * copy junk along.
1087  */
1088 struct mbuf *
1089 m_prepend(struct mbuf *m, int len, int how)
1090 {
1091 	struct mbuf *mn;
1092 
1093 	MGET(mn, how, m->m_type);
1094 	if (mn == (struct mbuf *)NULL) {
1095 		m_freem(m);
1096 		return ((struct mbuf *)NULL);
1097 	}
1098 	if (m->m_flags & M_PKTHDR)
1099 		M_MOVE_PKTHDR(mn, m);
1100 	mn->m_next = m;
1101 	m = mn;
1102 	if (len < MHLEN)
1103 		MH_ALIGN(m, len);
1104 	m->m_len = len;
1105 	return (m);
1106 }
1107 
1108 /*
1109  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1110  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
1111  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
1112  * Note that the copy is read-only, because clusters are not copied,
1113  * only their reference counts are incremented.
1114  */
1115 #define MCFail (mbstat.m_mcfail)
1116 
1117 struct mbuf *
1118 m_copym(const struct mbuf *m, int off0, int len, int wait)
1119 {
1120 	struct mbuf *n, **np;
1121 	int off = off0;
1122 	struct mbuf *top;
1123 	int copyhdr = 0;
1124 
1125 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
1126 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
1127 	if (off == 0 && m->m_flags & M_PKTHDR)
1128 		copyhdr = 1;
1129 	while (off > 0) {
1130 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1131 		if (off < m->m_len)
1132 			break;
1133 		off -= m->m_len;
1134 		m = m->m_next;
1135 	}
1136 	np = &top;
1137 	top = 0;
1138 	while (len > 0) {
1139 		if (m == 0) {
1140 			KASSERT(len == M_COPYALL,
1141 			    ("m_copym, length > size of mbuf chain"));
1142 			break;
1143 		}
1144 		MGET(n, wait, m->m_type);
1145 		*np = n;
1146 		if (n == 0)
1147 			goto nospace;
1148 		if (copyhdr) {
1149 			if (!m_dup_pkthdr(n, m, wait))
1150 				goto nospace;
1151 			if (len == M_COPYALL)
1152 				n->m_pkthdr.len -= off0;
1153 			else
1154 				n->m_pkthdr.len = len;
1155 			copyhdr = 0;
1156 		}
1157 		n->m_len = min(len, m->m_len - off);
1158 		if (m->m_flags & M_EXT) {
1159 			n->m_data = m->m_data + off;
1160 			m_extref(m);
1161 			n->m_ext = m->m_ext;
1162 			n->m_flags |= m->m_flags &
1163 					(M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
1164 		} else {
1165 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1166 			    (unsigned)n->m_len);
1167 		}
1168 		if (len != M_COPYALL)
1169 			len -= n->m_len;
1170 		off = 0;
1171 		m = m->m_next;
1172 		np = &n->m_next;
1173 	}
1174 	if (top == 0)
1175 		MCFail++;
1176 	return (top);
1177 nospace:
1178 	m_freem(top);
1179 	MCFail++;
1180 	return (0);
1181 }
1182 
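/*
 * Example: a hypothetical sketch of taking a reference-counted, read-only
 * copy of a whole packet with m_copym().  As noted above, cluster data is
 * shared rather than duplicated, so m_dup() should be used instead when a
 * writable copy is required:
 *
 *	struct mbuf *n;
 *
 *	n = m_copym(m, 0, M_COPYALL, MB_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 */
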
1183 /*
1184  * Copy an entire packet, including header (which must be present).
1185  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1186  * Note that the copy is read-only, because clusters are not copied,
1187  * only their reference counts are incremented.
1188  * Preserve alignment of the first mbuf so if the creator has left
1189  * some room at the beginning (e.g. for inserting protocol headers)
1190  * the copies also have the room available.
1191  */
1192 struct mbuf *
1193 m_copypacket(struct mbuf *m, int how)
1194 {
1195 	struct mbuf *top, *n, *o;
1196 
1197 	MGET(n, how, m->m_type);
1198 	top = n;
1199 	if (!n)
1200 		goto nospace;
1201 
1202 	if (!m_dup_pkthdr(n, m, how))
1203 		goto nospace;
1204 	n->m_len = m->m_len;
1205 	if (m->m_flags & M_EXT) {
1206 		n->m_data = m->m_data;
1207 		m_extref(m);
1208 		n->m_ext = m->m_ext;
1209 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
1210 	} else {
1211 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
1212 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1213 	}
1214 
1215 	m = m->m_next;
1216 	while (m) {
1217 		MGET(o, how, m->m_type);
1218 		if (!o)
1219 			goto nospace;
1220 
1221 		n->m_next = o;
1222 		n = n->m_next;
1223 
1224 		n->m_len = m->m_len;
1225 		if (m->m_flags & M_EXT) {
1226 			n->m_data = m->m_data;
1227 			m_extref(m);
1228 			n->m_ext = m->m_ext;
1229 			n->m_flags |= m->m_flags &
1230 					 (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
1231 		} else {
1232 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1233 		}
1234 
1235 		m = m->m_next;
1236 	}
1237 	return top;
1238 nospace:
1239 	m_freem(top);
1240 	MCFail++;
1241 	return 0;
1242 }
1243 
1244 /*
1245  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1246  * continuing for "len" bytes, into the indicated buffer.
1247  */
1248 void
1249 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1250 {
1251 	unsigned count;
1252 
1253 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1254 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1255 	while (off > 0) {
1256 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1257 		if (off < m->m_len)
1258 			break;
1259 		off -= m->m_len;
1260 		m = m->m_next;
1261 	}
1262 	while (len > 0) {
1263 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1264 		count = min(m->m_len - off, len);
1265 		bcopy(mtod(m, caddr_t) + off, cp, count);
1266 		len -= count;
1267 		cp += count;
1268 		off = 0;
1269 		m = m->m_next;
1270 	}
1271 }
1272 
1273 /*
1274  * Copy a packet header mbuf chain into a completely new chain, including
1275  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1276  * you need a writable copy of an mbuf chain.
1277  */
1278 struct mbuf *
1279 m_dup(struct mbuf *m, int how)
1280 {
1281 	struct mbuf **p, *top = NULL;
1282 	int remain, moff, nsize;
1283 
1284 	/* Sanity check */
1285 	if (m == NULL)
1286 		return (0);
1287 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1288 
1289 	/* While there's more data, get a new mbuf, tack it on, and fill it */
1290 	remain = m->m_pkthdr.len;
1291 	moff = 0;
1292 	p = &top;
1293 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
1294 		struct mbuf *n;
1295 
1296 		/* Get the next new mbuf */
1297 		MGET(n, how, m->m_type);
1298 		if (n == NULL)
1299 			goto nospace;
1300 		if (top == NULL) {		/* first one, must be PKTHDR */
1301 			if (!m_dup_pkthdr(n, m, how))
1302 				goto nospace;
1303 			nsize = MHLEN;
1304 		} else				/* not the first one */
1305 			nsize = MLEN;
1306 		if (remain >= MINCLSIZE) {
1307 			MCLGET(n, how);
1308 			if ((n->m_flags & M_EXT) == 0) {
1309 				(void)m_free(n);
1310 				goto nospace;
1311 			}
1312 			nsize = MCLBYTES;
1313 		}
1314 		n->m_len = 0;
1315 
1316 		/* Link it into the new chain */
1317 		*p = n;
1318 		p = &n->m_next;
1319 
1320 		/* Copy data from original mbuf(s) into new mbuf */
1321 		while (n->m_len < nsize && m != NULL) {
1322 			int chunk = min(nsize - n->m_len, m->m_len - moff);
1323 
1324 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1325 			moff += chunk;
1326 			n->m_len += chunk;
1327 			remain -= chunk;
1328 			if (moff == m->m_len) {
1329 				m = m->m_next;
1330 				moff = 0;
1331 			}
1332 		}
1333 
1334 		/* Check correct total mbuf length */
1335 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1336 		    	("%s: bogus m_pkthdr.len", __func__));
1337 	}
1338 	return (top);
1339 
1340 nospace:
1341 	m_freem(top);
1342 	MCFail++;
1343 	return (0);
1344 }
1345 
1346 /*
1347  * Concatenate mbuf chain n to m.
1348  * Both chains must be of the same type (e.g. MT_DATA).
1349  * Any m_pkthdr is not updated.
1350  */
1351 void
1352 m_cat(struct mbuf *m, struct mbuf *n)
1353 {
1354 	while (m->m_next)
1355 		m = m->m_next;
1356 	while (n) {
1357 		if (m->m_flags & M_EXT ||
1358 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1359 			/* just join the two chains */
1360 			m->m_next = n;
1361 			return;
1362 		}
1363 		/* splat the data from one into the other */
1364 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1365 		    (u_int)n->m_len);
1366 		m->m_len += n->m_len;
1367 		n = m_free(n);
1368 	}
1369 }
1370 
1371 void
1372 m_adj(struct mbuf *mp, int req_len)
1373 {
1374 	int len = req_len;
1375 	struct mbuf *m;
1376 	int count;
1377 
1378 	if ((m = mp) == NULL)
1379 		return;
1380 	if (len >= 0) {
1381 		/*
1382 		 * Trim from head.
1383 		 */
1384 		while (m != NULL && len > 0) {
1385 			if (m->m_len <= len) {
1386 				len -= m->m_len;
1387 				m->m_len = 0;
1388 				m = m->m_next;
1389 			} else {
1390 				m->m_len -= len;
1391 				m->m_data += len;
1392 				len = 0;
1393 			}
1394 		}
1395 		m = mp;
1396 		if (mp->m_flags & M_PKTHDR)
1397 			m->m_pkthdr.len -= (req_len - len);
1398 	} else {
1399 		/*
1400 		 * Trim from tail.  Scan the mbuf chain,
1401 		 * calculating its length and finding the last mbuf.
1402 		 * If the adjustment only affects this mbuf, then just
1403 		 * adjust and return.  Otherwise, rescan and truncate
1404 		 * after the remaining size.
1405 		 */
1406 		len = -len;
1407 		count = 0;
1408 		for (;;) {
1409 			count += m->m_len;
1410 			if (m->m_next == (struct mbuf *)0)
1411 				break;
1412 			m = m->m_next;
1413 		}
1414 		if (m->m_len >= len) {
1415 			m->m_len -= len;
1416 			if (mp->m_flags & M_PKTHDR)
1417 				mp->m_pkthdr.len -= len;
1418 			return;
1419 		}
1420 		count -= len;
1421 		if (count < 0)
1422 			count = 0;
1423 		/*
1424 		 * Correct length for chain is "count".
1425 		 * Find the mbuf with last data, adjust its length,
1426 		 * and toss data from remaining mbufs on chain.
1427 		 */
1428 		m = mp;
1429 		if (m->m_flags & M_PKTHDR)
1430 			m->m_pkthdr.len = count;
1431 		for (; m; m = m->m_next) {
1432 			if (m->m_len >= count) {
1433 				m->m_len = count;
1434 				break;
1435 			}
1436 			count -= m->m_len;
1437 		}
1438 		while (m->m_next)
1439 			(m = m->m_next) ->m_len = 0;
1440 	}
1441 }
1442 
1443 /*
1444  * Rearrange an mbuf chain so that len bytes are contiguous
1445  * and in the data area of an mbuf (so that mtod will work for a structure
1446  * of size len).  Returns the resulting mbuf chain on success, frees it and
1447  * returns NULL on failure.  If there is room, it will add up to
1448  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1449  * avoid being called next time.
1450  */
1451 #define MPFail (mbstat.m_mpfail)
1452 
1453 struct mbuf *
1454 m_pullup(struct mbuf *n, int len)
1455 {
1456 	struct mbuf *m;
1457 	int count;
1458 	int space;
1459 
1460 	/*
1461 	 * If first mbuf has no cluster, and has room for len bytes
1462 	 * without shifting current data, pullup into it,
1463 	 * otherwise allocate a new mbuf to prepend to the chain.
1464 	 */
1465 	if ((n->m_flags & M_EXT) == 0 &&
1466 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
1467 		if (n->m_len >= len)
1468 			return (n);
1469 		m = n;
1470 		n = n->m_next;
1471 		len -= m->m_len;
1472 	} else {
1473 		if (len > MHLEN)
1474 			goto bad;
1475 		MGET(m, MB_DONTWAIT, n->m_type);
1476 		if (m == 0)
1477 			goto bad;
1478 		m->m_len = 0;
1479 		if (n->m_flags & M_PKTHDR)
1480 			M_MOVE_PKTHDR(m, n);
1481 	}
1482 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1483 	do {
1484 		count = min(min(max(len, max_protohdr), space), n->m_len);
1485 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1486 		  (unsigned)count);
1487 		len -= count;
1488 		m->m_len += count;
1489 		n->m_len -= count;
1490 		space -= count;
1491 		if (n->m_len)
1492 			n->m_data += count;
1493 		else
1494 			n = m_free(n);
1495 	} while (len > 0 && n);
1496 	if (len > 0) {
1497 		(void) m_free(m);
1498 		goto bad;
1499 	}
1500 	m->m_next = n;
1501 	return (m);
1502 bad:
1503 	m_freem(n);
1504 	MPFail++;
1505 	return (0);
1506 }
1507 
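/*
 * Example: the classic m_pullup() idiom, sketched for a hypothetical
 * protocol input routine that needs a contiguous header of "hdrlen"
 * bytes before using mtod().  On failure m_pullup() has already freed
 * the chain, so the old pointer must not be touched:
 *
 *	if (m->m_len < hdrlen) {
 *		m = m_pullup(m, hdrlen);
 *		if (m == NULL)
 *			return;
 *	}
 *	hdr = mtod(m, struct some_hdr *);
 *
 * ("hdrlen", "hdr", and struct some_hdr are illustrative names only.)
 */
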
1508 /*
1509  * Partition an mbuf chain in two pieces, returning the tail --
1510  * all but the first len0 bytes.  In case of failure, it returns NULL and
1511  * attempts to restore the chain to its original state.
1512  *
1513  * Note that the resulting mbufs might be read-only, because the new
1514  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1515  * the "breaking point" happens to lie within a cluster mbuf. Use the
1516  * M_WRITABLE() macro to check for this case.
1517  */
1518 struct mbuf *
1519 m_split(struct mbuf *m0, int len0, int wait)
1520 {
1521 	struct mbuf *m, *n;
1522 	unsigned len = len0, remain;
1523 
1524 	for (m = m0; m && len > m->m_len; m = m->m_next)
1525 		len -= m->m_len;
1526 	if (m == 0)
1527 		return (0);
1528 	remain = m->m_len - len;
1529 	if (m0->m_flags & M_PKTHDR) {
1530 		MGETHDR(n, wait, m0->m_type);
1531 		if (n == 0)
1532 			return (0);
1533 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1534 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1535 		m0->m_pkthdr.len = len0;
1536 		if (m->m_flags & M_EXT)
1537 			goto extpacket;
1538 		if (remain > MHLEN) {
1539 			/* m can't be the lead packet */
1540 			MH_ALIGN(n, 0);
1541 			n->m_next = m_split(m, len, wait);
1542 			if (n->m_next == 0) {
1543 				(void) m_free(n);
1544 				return (0);
1545 			} else {
1546 				n->m_len = 0;
1547 				return (n);
1548 			}
1549 		} else
1550 			MH_ALIGN(n, remain);
1551 	} else if (remain == 0) {
1552 		n = m->m_next;
1553 		m->m_next = 0;
1554 		return (n);
1555 	} else {
1556 		MGET(n, wait, m->m_type);
1557 		if (n == 0)
1558 			return (0);
1559 		M_ALIGN(n, remain);
1560 	}
1561 extpacket:
1562 	if (m->m_flags & M_EXT) {
1563 		n->m_data = m->m_data + len;
1564 		m_extref(m);
1565 		n->m_ext = m->m_ext;
1566 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
1567 	} else {
1568 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1569 	}
1570 	n->m_len = remain;
1571 	m->m_len = len;
1572 	n->m_next = m->m_next;
1573 	m->m_next = 0;
1574 	return (n);
1575 }
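
/*
 * Example: a hypothetical sketch of splitting a packet at offset "off"
 * ("off" and the label are illustrative names).  Per the caveat above,
 * either half may still share a cluster with the other, so M_WRITABLE()
 * should be checked before the data of either half is modified in place:
 *
 *	struct mbuf *tail;
 *
 *	tail = m_split(m, off, MB_DONTWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);
 *	if (!M_WRITABLE(tail))
 *		goto copy_before_write;
 */
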
1576 /*
1577  * Routine to copy from device local memory into mbufs.
1578  */
1579 struct mbuf *
1580 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
1581 	void (*copy) (char *from, caddr_t to, u_int len))
1582 {
1583 	struct mbuf *m;
1584 	struct mbuf *top = 0, **mp = &top;
1585 	int off = off0, len;
1586 	char *cp;
1587 	char *epkt;
1588 
1589 	cp = buf;
1590 	epkt = cp + totlen;
1591 	if (off) {
1592 		cp += off + 2 * sizeof(u_short);
1593 		totlen -= 2 * sizeof(u_short);
1594 	}
1595 	MGETHDR(m, MB_DONTWAIT, MT_DATA);
1596 	if (m == 0)
1597 		return (0);
1598 	m->m_pkthdr.rcvif = ifp;
1599 	m->m_pkthdr.len = totlen;
1600 	m->m_len = MHLEN;
1601 
1602 	while (totlen > 0) {
1603 		if (top) {
1604 			MGET(m, MB_DONTWAIT, MT_DATA);
1605 			if (m == 0) {
1606 				m_freem(top);
1607 				return (0);
1608 			}
1609 			m->m_len = MLEN;
1610 		}
1611 		len = min(totlen, epkt - cp);
1612 		if (len >= MINCLSIZE) {
1613 			MCLGET(m, MB_DONTWAIT);
1614 			if (m->m_flags & M_EXT)
1615 				m->m_len = len = min(len, MCLBYTES);
1616 			else
1617 				len = m->m_len;
1618 		} else {
1619 			/*
1620 			 * Place initial small packet/header at end of mbuf.
1621 			 */
1622 			if (len < m->m_len) {
1623 				if (top == 0 && len + max_linkhdr <= m->m_len)
1624 					m->m_data += max_linkhdr;
1625 				m->m_len = len;
1626 			} else
1627 				len = m->m_len;
1628 		}
1629 		if (copy)
1630 			copy(cp, mtod(m, caddr_t), (unsigned)len);
1631 		else
1632 			bcopy(cp, mtod(m, caddr_t), (unsigned)len);
1633 		cp += len;
1634 		*mp = m;
1635 		mp = &m->m_next;
1636 		totlen -= len;
1637 		if (cp == epkt)
1638 			cp = buf;
1639 	}
1640 	return (top);
1641 }
1642 
1643 /*
1644  * Copy data from a buffer back into the indicated mbuf chain,
1645  * starting "off" bytes from the beginning, extending the mbuf
1646  * chain if necessary.
1647  */
1648 void
1649 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1650 {
1651 	int mlen;
1652 	struct mbuf *m = m0, *n;
1653 	int totlen = 0;
1654 
1655 	if (m0 == 0)
1656 		return;
1657 	while (off > (mlen = m->m_len)) {
1658 		off -= mlen;
1659 		totlen += mlen;
1660 		if (m->m_next == 0) {
1661 			n = m_getclr(MB_DONTWAIT, m->m_type);
1662 			if (n == 0)
1663 				goto out;
1664 			n->m_len = min(MLEN, len + off);
1665 			m->m_next = n;
1666 		}
1667 		m = m->m_next;
1668 	}
1669 	while (len > 0) {
1670 		mlen = min (m->m_len - off, len);
1671 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1672 		cp += mlen;
1673 		len -= mlen;
1674 		mlen += off;
1675 		off = 0;
1676 		totlen += mlen;
1677 		if (len == 0)
1678 			break;
1679 		if (m->m_next == 0) {
1680 			n = m_get(MB_DONTWAIT, m->m_type);
1681 			if (n == 0)
1682 				break;
1683 			n->m_len = min(MLEN, len);
1684 			m->m_next = n;
1685 		}
1686 		m = m->m_next;
1687 	}
1688 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1689 		m->m_pkthdr.len = totlen;
1690 }
1691 
1692 void
1693 m_print(const struct mbuf *m)
1694 {
1695 	int len;
1696 	const struct mbuf *m2;
1697 
1698 	len = m->m_pkthdr.len;
1699 	m2 = m;
1700 	while (len) {
1701 		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1702 		len -= m2->m_len;
1703 		m2 = m2->m_next;
1704 	}
1705 	return;
1706 }
1707 
1708 /*
1709  * "Move" mbuf pkthdr from "from" to "to".
1710  * "from" must have M_PKTHDR set, and "to" must be empty.
1711  */
1712 void
1713 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1714 {
1715 	KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster"));
1716 
1717 	to->m_flags = from->m_flags & M_COPYFLAGS;
1718 	to->m_data = to->m_pktdat;
1719 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
1720 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
1721 	from->m_flags &= ~M_PKTHDR;
1722 }
1723 
1724 /*
1725  * Duplicate "from"'s mbuf pkthdr in "to".
1726  * "from" must have M_PKTHDR set, and "to" must be empty.
1727  * In particular, this does a deep copy of the packet tags.
1728  */
1729 int
1730 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1731 {
1732 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
1733 	if ((to->m_flags & M_EXT) == 0)
1734 		to->m_data = to->m_pktdat;
1735 	to->m_pkthdr = from->m_pkthdr;
1736 	SLIST_INIT(&to->m_pkthdr.tags);
1737 	return (m_tag_copy_chain(to, from, how));
1738 }
1739 
1740 /*
1741  * Defragment a mbuf chain, returning the shortest possible
1742  * chain of mbufs and clusters.  If allocation fails and
1743  * this cannot be completed, NULL will be returned, but
1744  * the passed in chain will be unchanged.  Upon success,
1745  * the original chain will be freed, and the new chain
1746  * will be returned.
1747  *
1748  * If a non-packet header is passed in, the original
1749  * mbuf (chain?) will be returned unharmed.
1750  *
1751  * m_defrag_nofree doesn't free the passed in mbuf.
1752  */
1753 struct mbuf *
1754 m_defrag(struct mbuf *m0, int how)
1755 {
1756 	struct mbuf *m_new;
1757 
1758 	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
1759 		return (NULL);
1760 	if (m_new != m0)
1761 		m_freem(m0);
1762 	return (m_new);
1763 }
1764 
1765 struct mbuf *
1766 m_defrag_nofree(struct mbuf *m0, int how)
1767 {
1768 	struct mbuf	*m_new = NULL, *m_final = NULL;
1769 	int		progress = 0, length;
1770 
1771 	if (!(m0->m_flags & M_PKTHDR))
1772 		return (m0);
1773 
1774 #ifdef MBUF_STRESS_TEST
1775 	if (m_defragrandomfailures) {
1776 		int temp = arc4random() & 0xff;
1777 		if (temp == 0xba)
1778 			goto nospace;
1779 	}
1780 #endif
1781 
1782 	if (m0->m_pkthdr.len > MHLEN)
1783 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1784 	else
1785 		m_final = m_gethdr(how, MT_DATA);
1786 
1787 	if (m_final == NULL)
1788 		goto nospace;
1789 
1790 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1791 		goto nospace;
1792 
1793 	m_new = m_final;
1794 
1795 	while (progress < m0->m_pkthdr.len) {
1796 		length = m0->m_pkthdr.len - progress;
1797 		if (length > MCLBYTES)
1798 			length = MCLBYTES;
1799 
1800 		if (m_new == NULL) {
1801 			if (length > MLEN)
1802 				m_new = m_getcl(how, MT_DATA, 0);
1803 			else
1804 				m_new = m_get(how, MT_DATA);
1805 			if (m_new == NULL)
1806 				goto nospace;
1807 		}
1808 
1809 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1810 		progress += length;
1811 		m_new->m_len = length;
1812 		if (m_new != m_final)
1813 			m_cat(m_final, m_new);
1814 		m_new = NULL;
1815 	}
1816 	if (m0->m_next == NULL)
1817 		m_defraguseless++;
1818 	m_defragpackets++;
1819 	m_defragbytes += m_final->m_pkthdr.len;
1820 	return (m_final);
1821 nospace:
1822 	m_defragfailure++;
1823 	if (m_new)
1824 		m_free(m_new);
1825 	if (m_final)
1826 		m_freem(m_final);
1827 	return (NULL);
1828 }
1829 
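/*
 * Example: a hypothetical sketch of the usual m_defrag() pattern in a
 * transmit path, when a hardware DMA engine cannot handle an arbitrarily
 * long chain of segments (the "chain_is_too_fragmented" condition is
 * illustrative context only):
 *
 *	if (chain_is_too_fragmented) {
 *		struct mbuf *m_new;
 *
 *		m_new = m_defrag(m, MB_DONTWAIT);
 *		if (m_new == NULL) {
 *			m_freem(m);
 *			return (ENOBUFS);
 *		}
 *		m = m_new;
 *	}
 *
 * On success m_defrag() has already freed the original chain, so the
 * caller simply adopts the returned pointer; m_defrag_nofree() is the
 * variant to use when the original chain must be preserved.
 */
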
1830 /*
1831  * Move data from uio into mbufs.
1832  * A length of zero means copy the whole uio.
1833  */
1834 struct mbuf *
1835 m_uiomove(struct uio *uio, int wait, int len0)
1836 {
1837 	struct mbuf *head;		/* result mbuf chain */
1838 	struct mbuf *m;			/* current working mbuf */
1839 	struct mbuf **mp;
1840 	int resid, datalen, error;
1841 
1842 	resid = (len0 == 0) ? uio->uio_resid : min(len0, uio->uio_resid);
1843 
1844 	head = NULL;
1845 	mp = &head;
1846 	do {
1847 		if (resid > MHLEN) {
1848 			m = m_getcl(wait, MT_DATA, head == NULL ? M_PKTHDR : 0);
1849 			if (m == NULL)
1850 				goto failed;
1851 			if (m->m_flags & M_PKTHDR)
1852 				m->m_pkthdr.len = 0;
1853 		} else {
1854 			if (head == NULL) {
1855 				MGETHDR(m, wait, MT_DATA);
1856 				if (m == NULL)
1857 					goto failed;
1858 				m->m_pkthdr.len = 0;
1859 				/* Leave room for protocol headers. */
1860 				if (resid < MHLEN)
1861 					MH_ALIGN(m, resid);
1862 			} else {
1863 				MGET(m, wait, MT_DATA);
1864 				if (m == NULL)
1865 					goto failed;
1866 			}
1867 		}
1868 		datalen = min(MCLBYTES, resid);
1869 		error = uiomove(mtod(m, caddr_t), datalen, uio);
1870 		if (error) {
1871 			m_free(m);
1872 			goto failed;
1873 		}
1874 		m->m_len = datalen;
1875 		*mp = m;
1876 		mp = &m->m_next;
1877 		head->m_pkthdr.len += datalen;
1878 		resid -= datalen;
1879 	} while (resid > 0);
1880 
1881 	return (head);
1882 
1883 failed:
1884 	if (head)
1885 		m_freem(head);
1886 	return (NULL);
1887 }
1888
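/*
 * Example: a hypothetical sketch of moving a caller-supplied uio into an
 * mbuf chain, e.g. from a character-device write routine ("uio" arrives
 * from the caller; error handling is abbreviated):
 *
 *	struct mbuf *m;
 *
 *	m = m_uiomove(uio, MB_WAIT, 0);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 * A length argument of zero, as above, consumes the whole remaining uio,
 * and the head of the returned chain has m_pkthdr.len already set.
 */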