xref: /dragonfly/sys/kern/uipc_mbuf.c (revision b40e316c)
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
36  *
37  * License terms: all terms for the DragonFly license above plus the following:
38  *
39  * 4. All advertising materials mentioning features or use of this software
40  *    must display the following acknowledgement:
41  *
42  *	This product includes software developed by Jeffrey M. Hsu
43  *	for the DragonFly Project.
44  *
45  *    This requirement may be waived with permission from Jeffrey Hsu.
46  *    This requirement will sunset and may be removed on July 8 2005,
47  *    after which the standard DragonFly license (as shown above) will
48  *    apply.
49  */
50 
51 /*
52  * Copyright (c) 1982, 1986, 1988, 1991, 1993
53  *	The Regents of the University of California.  All rights reserved.
54  *
55  * Redistribution and use in source and binary forms, with or without
56  * modification, are permitted provided that the following conditions
57  * are met:
58  * 1. Redistributions of source code must retain the above copyright
59  *    notice, this list of conditions and the following disclaimer.
60  * 2. Redistributions in binary form must reproduce the above copyright
61  *    notice, this list of conditions and the following disclaimer in the
62  *    documentation and/or other materials provided with the distribution.
63  * 3. All advertising materials mentioning features or use of this software
64  *    must display the following acknowledgement:
65  *	This product includes software developed by the University of
66  *	California, Berkeley and its contributors.
67  * 4. Neither the name of the University nor the names of its contributors
68  *    may be used to endorse or promote products derived from this software
69  *    without specific prior written permission.
70  *
71  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81  * SUCH DAMAGE.
82  *
83  * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
84  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
85  * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.29 2004/11/18 01:42:26 dillon Exp $
86  */
87 
88 #include "opt_param.h"
89 #include "opt_mbuf_stress_test.h"
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/malloc.h>
93 #include <sys/mbuf.h>
94 #include <sys/kernel.h>
95 #include <sys/sysctl.h>
96 #include <sys/domain.h>
97 #include <sys/protosw.h>
98 #include <sys/uio.h>
99 #include <sys/thread.h>
100 #include <sys/globaldata.h>
101 #include <sys/thread2.h>
102 
103 #include <vm/vm.h>
104 #include <vm/vm_kern.h>
105 #include <vm/vm_extern.h>
106 
107 #ifdef INVARIANTS
108 #include <machine/cpu.h>
109 #endif
110 
111 /*
112  * mbuf cluster meta-data
113  */
114 typedef struct mbcluster {
115 	struct mbcluster *mcl_next;
116 	int32_t	mcl_magic;
117 	int32_t	mcl_refs;
118 	void	*mcl_data;
119 } *mbcluster_t;
120 
121 typedef struct mbuf *mbuf_t;
122 
123 #define MCL_MAGIC	0x6d62636c
124 
125 static void mbinit (void *);
126 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
127 
128 static u_long	mbtypes[MT_NTYPES];
129 
130 struct mbstat mbstat;
131 int	max_linkhdr;
132 int	max_protohdr;
133 int	max_hdr;
134 int	max_datalen;
135 int	m_defragpackets;
136 int	m_defragbytes;
137 int	m_defraguseless;
138 int	m_defragfailure;
139 #ifdef MBUF_STRESS_TEST
140 int	m_defragrandomfailures;
141 #endif
142 
143 int	nmbclusters;
144 int	nmbufs;
145 u_int	m_mballoc_wid = 0;
146 u_int	m_clalloc_wid = 0;
147 
148 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
149 	   &max_linkhdr, 0, "");
150 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
151 	   &max_protohdr, 0, "");
152 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
153 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
154 	   &max_datalen, 0, "");
155 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
156 	   &mbuf_wait, 0, "");
157 SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
158 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
159 	   sizeof(mbtypes), "LU", "");
160 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RW,
161 	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
162 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RW, &nmbufs, 0,
163 	   "Maximum number of mbufs available");
164 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
165 	   &m_defragpackets, 0, "");
166 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
167 	   &m_defragbytes, 0, "");
168 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
169 	   &m_defraguseless, 0, "");
170 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
171 	   &m_defragfailure, 0, "");
172 #ifdef MBUF_STRESS_TEST
173 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
174 	   &m_defragrandomfailures, 0, "");
175 #endif
176 
177 static int mcl_pool_count;
178 static int mcl_pool_max = 20;
179 static int mcl_free_max = 1000;
180 static int mbuf_free_max = 5000;
181 
182 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_max, CTLFLAG_RW, &mcl_pool_max, 0,
183            "Maximum number of mbuf+cluster pairs in the free list");
184 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_count, CTLFLAG_RD, &mcl_pool_count, 0,
185            "Current number of mbuf+cluster pairs in the free list");
186 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_free_max, CTLFLAG_RW, &mcl_free_max, 0,
187            "Maximum number of clusters on the free list");
188 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_free_max, CTLFLAG_RW, &mbuf_free_max, 0,
189            "Maximum number of mbufs on the free list");
190 
191 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
192 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
193 
194 static mbuf_t mmbfree;
195 static mbcluster_t mclfree;
196 static struct mbuf *mcl_pool;
197 
198 static void m_reclaim (void);
199 static int m_mballoc(int nmb, int how);
200 static int m_clalloc(int ncl, int how);
201 static struct mbuf *m_mballoc_wait(int caller, int type);
202 static void m_mclref(void *arg);
203 static void m_mclfree(void *arg);
204 
205 #ifndef NMBCLUSTERS
206 #define NMBCLUSTERS	(512 + maxusers * 16)
207 #endif
208 #ifndef NMBUFS
209 #define NMBUFS		(nmbclusters * 4)
210 #endif
211 
212 /*
213  * Perform sanity checks of tunables declared above.
214  */
215 static void
216 tunable_mbinit(void *dummy)
217 {
218 
219 	/*
220 	 * This has to be done before VM init.
221 	 */
222 	nmbclusters = NMBCLUSTERS;
223 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
224 	nmbufs = NMBUFS;
225 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
226 	/* Sanity checks */
227 	if (nmbufs < nmbclusters * 2)
228 		nmbufs = nmbclusters * 2;
229 
230 	return;
231 }
232 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
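
/*
 * Worked example of the defaults above (illustrative figures only): with
 * maxusers = 64, NMBCLUSTERS evaluates to 512 + 64 * 16 = 1536 and NMBUFS
 * to 1536 * 4 = 6144, which already satisfies the nmbufs >= nmbclusters * 2
 * sanity check.  The TUNABLE_INT_FETCH() calls let the administrator
 * override either value from the loader environment, typically via
 * /boot/loader.conf, e.g.:
 *
 *	kern.ipc.nmbclusters="16384"
 *	kern.ipc.nmbufs="65536"
 */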
233 
234 /* "number of clusters of pages" */
235 #define NCL_INIT	1
236 
237 #define NMB_INIT	16
238 
239 /* ARGSUSED*/
240 static void
241 mbinit(void *dummy)
242 {
243 	mmbfree = NULL;
244 	mclfree = NULL;
245 	mbstat.m_msize = MSIZE;
246 	mbstat.m_mclbytes = MCLBYTES;
247 	mbstat.m_minclsize = MINCLSIZE;
248 	mbstat.m_mlen = MLEN;
249 	mbstat.m_mhlen = MHLEN;
250 
251 	crit_enter();
252 	if (m_mballoc(NMB_INIT, MB_DONTWAIT) == 0)
253 		goto bad;
254 #if MCLBYTES <= PAGE_SIZE
255 	if (m_clalloc(NCL_INIT, MB_DONTWAIT) == 0)
256 		goto bad;
257 #else
258 	/* It's OK to call contigmalloc in this context. */
259 	if (m_clalloc(16, MB_WAIT) == 0)
260 		goto bad;
261 #endif
262 	crit_exit();
263 	return;
264 bad:
265 	crit_exit();
266 	panic("mbinit");
267 }
268 
269 /*
270  * Allocate at least nmb mbufs and place on mbuf free list.
271  * Returns the number of mbufs successfully allocated, 0 if none.
272  *
273  * Must be called while in a critical section.
274  */
275 static int
276 m_mballoc(int nmb, int how)
277 {
278 	int i;
279 	struct mbuf *m;
280 
281 	/*
282 	 * If we've hit the mbuf limit, stop allocating (or trying to)
283 	 * in order to avoid exhausting kernel memory entirely.
284 	 */
285 	if ((nmb + mbstat.m_mbufs) > nmbufs)
286 		return (0);
287 
288 	/*
289 	 * Attempt to allocate the requested number of mbufs; terminate when
290 	 * an allocation fails, but if blocking is allowed allocate at least
291 	 * one.
292 	 */
293 	for (i = 0; i < nmb; ++i) {
294 		m = malloc(MSIZE, M_MBUF, M_NOWAIT|M_NULLOK|M_ZERO);
295 		if (m == NULL) {
296 			if (how == MB_WAIT) {
297 				mbstat.m_wait++;
298 				m = malloc(MSIZE, M_MBUF,
299 					    M_WAITOK|M_NULLOK|M_ZERO);
300 			}
301 			if (m == NULL)
302 				break;
303 		}
304 		m->m_next = mmbfree;
305 		mmbfree = m;
306 		++mbstat.m_mbufs;
307 		++mbtypes[MT_FREE];
308 		how = MB_DONTWAIT;
309 	}
310 	return(i);
311 }
312 
313 /*
314  * Once mbuf memory has been exhausted, and the call to the allocation macros
315  * (or, in some cases, functions) was made with MB_WAIT, it is necessary to rely
316  * solely on reclaimed mbufs.  Here we wait for an mbuf to be freed for a
317  * designated (mbuf_wait) amount of time.
318  */
319 static struct mbuf *
320 m_mballoc_wait(int caller, int type)
321 {
322 	struct mbuf *m;
323 
324 	crit_enter();
325 	m_mballoc_wid++;
326 	if ((tsleep(&m_mballoc_wid, 0, "mballc", mbuf_wait)) == EWOULDBLOCK)
327 		m_mballoc_wid--;
328 	crit_exit();
329 
330 	/*
331 	 * Now that we (think) we've got something, we will redo an
332 	 * MGET, but avoid getting into another instance of m_mballoc_wait().
333 	 * XXX: We retry the fetch _even_ if the sleep timed out.  This is left
334 	 *      this way, purposely, in the [unlikely] case that an mbuf was
335 	 *      freed but the sleep was not awakened in time.
336 	 */
337 	m = NULL;
338 	switch (caller) {
339 	case MGET_C:
340 		MGET(m, MB_DONTWAIT, type);
341 		break;
342 	case MGETHDR_C:
343 		MGETHDR(m, MB_DONTWAIT, type);
344 		break;
345 	default:
346 		panic("m_mballoc_wait: invalid caller (%d)", caller);
347 	}
348 
349 	crit_enter();
350 	if (m != NULL) {		/* We waited and got something... */
351 		mbstat.m_wait++;
352 		/* Wake up another if we have more free. */
353 		if (mmbfree != NULL)
354 			MMBWAKEUP();
355 	}
356 	crit_exit();
357 	return (m);
358 }
359 
360 #if MCLBYTES > PAGE_SIZE
361 static int i_want_my_mcl;
362 
363 static void
364 kproc_mclalloc(void)
365 {
366 	int status;
367 
368 	crit_enter();
369 	for (;;) {
370 		tsleep(&i_want_my_mcl, 0, "mclalloc", 0);
371 
372 		while (i_want_my_mcl > 0) {
373 			if (m_clalloc(1, MB_WAIT) == 0)
374 				printf("m_clalloc failed even in thread context!\n");
375 			--i_want_my_mcl;
376 		}
377 	}
378 	/* not reached */
379 	crit_exit();
380 }
381 
382 static struct thread *mclallocthread;
383 static struct kproc_desc mclalloc_kp = {
384 	"mclalloc",
385 	kproc_mclalloc,
386 	&mclallocthread
387 };
388 SYSINIT(mclallocthread, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
389 	   &mclalloc_kp);
390 #endif
391 
392 /*
393  * Allocate at least nmb mbuf clusters and place on mbuf free list.
394  * Returns the number of mbuf clusters successfully allocated, 0 if none.
395  *
396  * Must be called while in a critical section.
397  */
398 static int
399 m_clalloc(int ncl, int how)
400 {
401 	static int last_report;
402 	mbcluster_t mcl;
403 	void *data;
404 	int i;
405 
406 	/*
407 	 * If we've hit the mbuf cluster limit, stop allocating (or trying to).
408 	 */
409 	if ((ncl + mbstat.m_clusters) > nmbclusters)
410 		ncl = 0;
411 
412 	/*
413 	 * Attempt to allocate the requested number of mbuf clusters;
414 	 * terminate when an allocation fails, but if blocking is allowed
415 	 * allocate at least one.
416 	 *
417 	 * We need to allocate two structures for each cluster... a
418 	 * ref counting / governing structure and the actual data.  MCLBYTES
419 	 * should be a power of 2 which means that the slab allocator will
420 	 * return a buffer that does not cross a page boundary.
421 	 */
422 	for (i = 0; i < ncl; ++i) {
423 		/*
424 		 * Meta structure
425 		 */
426 		mcl = malloc(sizeof(*mcl), M_MBUFCL, M_NOWAIT|M_NULLOK|M_ZERO);
427 		if (mcl == NULL && how == MB_WAIT) {
428 			mbstat.m_wait++;
429 			mcl = malloc(sizeof(*mcl),
430 					M_MBUFCL, M_WAITOK|M_NULLOK|M_ZERO);
431 		}
432 
433 		/*
434 		 * Physically contiguous data buffer.
435 		 */
436 #if MCLBYTES > PAGE_SIZE
437 		if (how != MB_WAIT) {
438 			i_want_my_mcl += ncl - i;
439 			wakeup(&i_want_my_mcl);
440 			mbstat.m_wait++;
441 			data = NULL;
442 		} else {
443 			data = contigmalloc_map(MCLBYTES, M_MBUFCL,
444 				M_WAITOK, 0ul, ~0ul, PAGE_SIZE, 0, kernel_map);
445 		}
446 #else
447 		data = malloc(MCLBYTES, M_MBUFCL, M_NOWAIT|M_NULLOK);
448 		if (data == NULL) {
449 			if (how == MB_WAIT) {
450 				mbstat.m_wait++;
451 				data = malloc(MCLBYTES, M_MBUFCL,
452 						M_WAITOK|M_NULLOK);
453 			}
454 		}
455 #endif
456 		if (data == NULL) {
457 			free(mcl, M_MBUFCL);
458 			break;
459 		}
460 		mcl->mcl_next = mclfree;
461 		mcl->mcl_data = data;
462 		mcl->mcl_magic = MCL_MAGIC;
463 		mcl->mcl_refs = 0;
464 		mclfree = mcl;
465 		++mbstat.m_clfree;
466 		++mbstat.m_clusters;
467 		how = MB_DONTWAIT;
468 	}
469 
470 	/*
471  * If we could not allocate any, report the failure no more often than
472  * once a second.
473 	 */
474 	if (i == 0) {
475 		mbstat.m_drops++;
476 		if (ticks < last_report || (ticks - last_report) >= hz) {
477 			last_report = ticks;
478 			printf("All mbuf clusters exhausted, please see tuning(7).\n");
479 		}
480 	}
481 	return (i);
482 }
483 
484 /*
485  * Once cluster memory has been exhausted and the allocation is called with
486  * MB_WAIT, we rely on the mclfree list.  If nothing is free, we will
487  * sleep for a designated amount of time (mbuf_wait) or until we're woken up
488  * due to sudden mbuf cluster availability.
489  *
490  * Must be called while in a critical section.
491  */
492 static void
493 m_clalloc_wait(void)
494 {
495 	/* If in interrupt context, and INVARIANTS, maintain sanity and die. */
496 	KASSERT(mycpu->gd_intr_nesting_level == 0,
497 		("CLALLOC: CANNOT WAIT IN INTERRUPT"));
498 
499 	/*
500 	 * Sleep until something's available or until we expire.
501 	 */
502 	m_clalloc_wid++;
503 	if ((tsleep(&m_clalloc_wid, 0, "mclalc", mbuf_wait)) == EWOULDBLOCK)
504 		m_clalloc_wid--;
505 
506 	/*
507 	 * Try the allocation once more, and if we see two or more
508 	 * free entries, wake up others as well.
509 	 */
510 	m_clalloc(1, MB_WAIT);
511 	if (mclfree && mclfree->mcl_next) {
512 		MCLWAKEUP();
513 	}
514 }
515 
516 /*
517  * Return the number of references to this mbuf's data.  0 is returned
518  * if the mbuf is not M_EXT, a reference count is returned if it is
519  * M_EXT|M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
520  */
521 int
522 m_sharecount(struct mbuf *m)
523 {
524     int count;
525 
526     switch(m->m_flags & (M_EXT|M_EXT_CLUSTER)) {
527     case 0:
528 	count = 0;
529 	break;
530     case M_EXT:
531 	count = 99;
532 	break;
533     case M_EXT|M_EXT_CLUSTER:
534 	count = ((mbcluster_t)m->m_ext.ext_arg)->mcl_refs;
535 	break;
536     default:
537 	panic("bad mbuf flags: %p", m);
538 	count = 0;
539     }
540     return(count);
541 }
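
/*
 * Usage sketch (hypothetical caller code): a caller that wants to scribble
 * on cluster data in place can consult the share count and fall back to a
 * writable deep copy when the data is shared:
 *
 *	if (m_sharecount(m) > 1) {
 *		struct mbuf *n = m_dup(m, MB_DONTWAIT);
 *
 *		if (n == NULL)
 *			return (ENOBUFS);
 *		m_freem(m);
 *		m = n;
 *	}
 */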
542 
543 /*
544  * change mbuf to new type
545  */
546 void
547 m_chtype(struct mbuf *m, int type)
548 {
549 	crit_enter();
550 	--mbtypes[m->m_type];
551 	++mbtypes[type];
552 	m->m_type = type;
553 	crit_exit();
554 }
555 
556 /*
557  * When MGET fails, ask protocols to free space when short of memory,
558  * then re-attempt to allocate an mbuf.
559  */
560 struct mbuf *
561 m_retry(int how, int t)
562 {
563 	struct mbuf *m;
564 
565 	/*
566 	 * Must only do the reclaim if not in an interrupt context.
567 	 */
568 	if (how == MB_WAIT) {
569 		KASSERT(mycpu->gd_intr_nesting_level == 0,
570 		    ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
571 		m_reclaim();
572 	}
573 
574 	/*
575 	 * Try to pull a new mbuf out of the cache; if the cache is empty
576 	 * try to allocate a new one, and if that doesn't work we give up.
577 	 */
578 	crit_enter();
579 	if ((m = mmbfree) == NULL) {
580 		m_mballoc(1, how);
581 		if ((m = mmbfree) == NULL) {
582 			static int last_report;
583 
584 			mbstat.m_drops++;
585 			crit_exit();
586 			if (ticks < last_report ||
587 			    (ticks - last_report) >= hz) {
588 				last_report = ticks;
589 				printf("All mbufs exhausted, please see tuning(7).\n");
590 			}
591 			return (NULL);
592 		}
593 	}
594 
595 	/*
596 	 * Cache case, adjust globals before leaving the critical section
597 	 */
598 	mmbfree = m->m_next;
599 	mbtypes[MT_FREE]--;
600 	mbtypes[t]++;
601 	mbstat.m_wait++;
602 	crit_exit();
603 
604 	m->m_type = t;
605 	m->m_next = NULL;
606 	m->m_nextpkt = NULL;
607 	m->m_data = m->m_dat;
608 	m->m_flags = 0;
609 	return (m);
610 }
611 
612 /*
613  * As above; retry an MGETHDR.
614  */
615 struct mbuf *
616 m_retryhdr(int how, int t)
617 {
618 	struct mbuf *m;
619 
620 	/*
621 	 * Must only do the reclaim if not in an interrupt context.
622 	 */
623 	if (how == MB_WAIT) {
624 		KASSERT(mycpu->gd_intr_nesting_level == 0,
625 		    ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
626 		m_reclaim();
627 	}
628 
629 	/*
630 	 * Try to pull a new mbuf out of the cache; if the cache is empty
631 	 * try to allocate a new one, and if that doesn't work we give up.
632 	 */
633 	crit_enter();
634 	if ((m = mmbfree) == NULL) {
635 		m_mballoc(1, how);
636 		if ((m = mmbfree) == NULL) {
637 			static int last_report;
638 
639 			mbstat.m_drops++;
640 			crit_exit();
641 			if (ticks < last_report ||
642 			    (ticks - last_report) >= hz) {
643 				last_report = ticks;
644 				printf("All mbufs exhausted, please see tuning(7).\n");
645 			}
646 			return (NULL);
647 		}
648 	}
649 
650 	/*
651 	 * Cache case, adjust globals before leaving the critical section
652 	 */
653 	mmbfree = m->m_next;
654 	mbtypes[MT_FREE]--;
655 	mbtypes[t]++;
656 	mbstat.m_wait++;
657 	crit_exit();
658 
659 	m->m_type = t;
660 	m->m_next = NULL;
661 	m->m_nextpkt = NULL;
662 	m->m_data = m->m_pktdat;
663 	m->m_flags = M_PKTHDR;
664 	m->m_pkthdr.rcvif = NULL;
665 	SLIST_INIT(&m->m_pkthdr.tags);
666 	m->m_pkthdr.csum_flags = 0;
667 	return (m);
668 }
669 
670 static void
671 m_reclaim(void)
672 {
673 	struct domain *dp;
674 	struct protosw *pr;
675 
676 	crit_enter();
677 	for (dp = domains; dp; dp = dp->dom_next) {
678 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
679 			if (pr->pr_drain)
680 				(*pr->pr_drain)();
681 		}
682 	}
683 	crit_exit();
684 	mbstat.m_drain++;
685 }
686 
687 /*
688  * Allocate an mbuf.  If no mbufs are immediately available try to
689  * bring a bunch more into our cache (mmbfree list).  A critical
690  * section is required to protect the mmbfree list and counters
691  * against interrupts.
692  */
693 struct mbuf *
694 m_get(int how, int type)
695 {
696 	struct mbuf *m;
697 
698 	/*
699 	 * Try to pull a new mbuf out of the cache; if the cache is empty
700 	 * try to allocate a new one, and if that doesn't work try even harder
701 	 * by calling m_retry().
702 	 */
703 	crit_enter();
704 	if ((m = mmbfree) == NULL) {
705 		m_mballoc(1, how);
706 		if ((m = mmbfree) == NULL) {
707 			crit_exit();
708 			m = m_retry(how, type);
709 			if (m == NULL && how == MB_WAIT)
710 				m = m_mballoc_wait(MGET_C, type);
711 			return (m);
712 		}
713 	}
714 
715 	/*
716 	 * Cache case, adjust globals before leaving the critical section
717 	 */
718 	mmbfree = m->m_next;
719 	mbtypes[MT_FREE]--;
720 	mbtypes[type]++;
721 	crit_exit();
722 
723 	m->m_type = type;
724 	m->m_next = NULL;
725 	m->m_nextpkt = NULL;
726 	m->m_data = m->m_dat;
727 	m->m_flags = 0;
728 	return (m);
729 }
730 
731 struct mbuf *
732 m_gethdr(int how, int type)
733 {
734 	struct mbuf *m;
735 
736 	/*
737 	 * Try to pull a new mbuf out of the cache; if the cache is empty
738 	 * try to allocate a new one, and if that doesn't work try even harder
739 	 * by calling m_retryhdr().
740 	 */
741 	crit_enter();
742 	if ((m = mmbfree) == NULL) {
743 		m_mballoc(1, how);
744 		if ((m = mmbfree) == NULL) {
745 			crit_exit();
746 			m = m_retryhdr(how, type);
747 			if (m == NULL && how == MB_WAIT)
748 				m = m_mballoc_wait(MGETHDR_C, type);
749 			return(m);
750 		}
751 	}
752 
753 	/*
754 	 * Cache case, adjust globals before leaving the critical section
755 	 */
756 	mmbfree = m->m_next;
757 	mbtypes[MT_FREE]--;
758 	mbtypes[type]++;
759 	crit_exit();
760 
761 	m->m_type = type;
762 	m->m_next = NULL;
763 	m->m_nextpkt = NULL;
764 	m->m_data = m->m_pktdat;
765 	m->m_flags = M_PKTHDR;
766 	m->m_pkthdr.rcvif = NULL;
767 	SLIST_INIT(&m->m_pkthdr.tags);
768 	m->m_pkthdr.csum_flags = 0;
769 	m->m_pkthdr.pf_flags = 0;
770 	return (m);
771 }
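
/*
 * Usage sketch (hypothetical caller code): callers typically allocate a
 * plain mbuf with m_get() or a packet-header mbuf with m_gethdr() and
 * check for failure, e.g.:
 *
 *	struct mbuf *m;
 *
 *	m = m_gethdr(MB_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_pkthdr.len = m->m_len = 0;
 */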
772 
773 struct mbuf *
774 m_getclr(int how, int type)
775 {
776 	struct mbuf *m;
777 
778 	if ((m = m_get(how, type)) != NULL) {
779 		bzero(mtod(m, caddr_t), MLEN);
780 	}
781 	return (m);
782 }
783 
784 /*
785  * m_getcl() returns an mbuf with an attached cluster.
786  * Because many network drivers use this kind of buffers a lot, it is
787  * Because many network drivers use this kind of buffer heavily, it is
788  * convenient to keep a small pool of free buffers of this kind.
789  * Even a small size such as 10 gives about a 10% improvement in the
790  * forwarding rate in a bridge or router.
791  * The size of this free list is controlled by the sysctl variable
792  * mcl_pool_max.  The list is populated in m_free(), and used in
793  * m_getcl() if elements are available.
793  */
794 struct mbuf *
795 m_getcl(int how, short type, int flags)
796 {
797 	struct mbuf *mp;
798 
799 	crit_enter();
800 	if (flags & M_PKTHDR) {
801 		if (type == MT_DATA && mcl_pool) {
802 			mp = mcl_pool;
803 			mcl_pool = mp->m_nextpkt;
804 			--mcl_pool_count;
805 			crit_exit();
806 			mp->m_nextpkt = NULL;
807 			mp->m_data = mp->m_ext.ext_buf;
808 			mp->m_flags = M_PKTHDR|M_EXT|M_EXT_CLUSTER;
809 			mp->m_pkthdr.rcvif = NULL;
810 			mp->m_pkthdr.csum_flags = 0;
811 			return mp;
812 		}
813 		MGETHDR(mp, how, type);
814 	} else {
815 		MGET(mp, how, type);
816 	}
817 	if (mp) {
818 		m_mclget(mp, how);
819 		if ((mp->m_flags & M_EXT) == 0) {
820 			m_free(mp);
821 			mp = NULL;
822 		}
823 	}
824 	crit_exit();
825 	return (mp);
826 }
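
/*
 * Usage sketch (hypothetical caller code): a driver receive path commonly
 * grabs a packet-header mbuf with the cluster already attached:
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_pkthdr.len = m->m_len = MCLBYTES;
 */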
827 
828 /*
829  * struct mbuf *
830  * m_getm(m, len, how, type)
831  *
832  * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits
833  * best) and return a pointer to the top of the allocated chain. If m is
834  * non-null, then we assume that it is a single mbuf or an mbuf chain to
835  * which we want len bytes worth of mbufs and/or clusters attached, and so
836  * if we succeed in allocating it, we will just return a pointer to m.
837  *
838  * If we happen to fail at any point during the allocation, we will free
839  * up everything we have already allocated and return NULL.
840  *
841  */
842 struct mbuf *
843 m_getm(struct mbuf *m, int len, int how, int type)
844 {
845 	struct mbuf *top, *tail, *mp, *mtail = NULL;
846 
847 	KASSERT(len >= 0, ("len is < 0 in m_getm"));
848 
849 	mp = m_get(how, type);
850 	if (mp == NULL) {
851 		return (NULL);
852 	} else if (len > MINCLSIZE) {
853 		m_mclget(mp, how);
854 		if ((mp->m_flags & M_EXT) == 0) {
855 			m_free(mp);
856 			return (NULL);
857 		}
858 	}
859 	mp->m_len = 0;
860 	len -= M_TRAILINGSPACE(mp);
861 
862 	if (m != NULL) {
863 		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
864 			;
865 	} else {
866 		m = mp;
867 	}
868 
869 	top = tail = mp;
870 	while (len > 0) {
871 		mp = m_get(how, type);
872 		if (mp == NULL)
873 			goto failed;
874 
875 		tail->m_next = mp;
876 		tail = mp;
877 		if (len > MINCLSIZE) {
878 			m_mclget(mp, how);
879 			if ((mp->m_flags & M_EXT) == 0)
880 				goto failed;
881 		}
882 
883 		mp->m_len = 0;
884 		len -= M_TRAILINGSPACE(mp);
885 	}
886 
887 	if (mtail != NULL)
888 		mtail->m_next = top;
889 	return (m);
890 failed:
891 	m_freem(top);
892 	return (NULL);
893 }
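
/*
 * Usage sketch (hypothetical caller code): to reserve space for "size"
 * bytes of data up front, a caller might do:
 *
 *	struct mbuf *m;
 *
 *	m = m_getm(NULL, size, MB_WAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	(m now heads a chain with at least size bytes of trailing space)
 */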
894 
895 /*
896  *  m_mclget() - Adds a cluster to a normal mbuf, M_EXT is set on success.
897  */
898 void
899 m_mclget(struct mbuf *m, int how)
900 {
901 	mbcluster_t mcl;
902 
903 	KKASSERT((m->m_flags & M_EXT_OLD) == 0);
904 
905 	/*
906 	 * Allocate a cluster, return if we can't get one.
907 	 */
908 	crit_enter();
909 	if ((mcl = mclfree) == NULL) {
910 		m_clalloc(1, how);
911 		if ((mcl = mclfree) == NULL) {
912 			if (how == MB_WAIT) {
913 				m_clalloc_wait();
914 				mcl = mclfree;
915 			}
916 			if (mcl == NULL) {
917 				crit_exit();
918 				return;
919 			}
920 		}
921 	}
922 
923 	/*
924 	 * We have a cluster, unlink it from the free list and set the ref
925 	 * count.
926 	 */
927 	KKASSERT(mcl->mcl_refs == 0);
928 	mclfree = mcl->mcl_next;
929 	mcl->mcl_refs = 1;
930 	--mbstat.m_clfree;
931 	crit_exit();
932 
933 	/*
934 	 * Add the cluster to the mbuf.  The caller will detect that the
935 	 * mbuf now has an attached cluster.
936 	 */
937 	m->m_ext.ext_arg = mcl;
938 	m->m_ext.ext_buf = mcl->mcl_data;
939 	m->m_ext.ext_nref.new = m_mclref;
940 	m->m_ext.ext_nfree.new = m_mclfree;
941 	m->m_ext.ext_size = MCLBYTES;
942 
943 	m->m_data = m->m_ext.ext_buf;
944 	m->m_flags |= M_EXT | M_EXT_CLUSTER;
945 }
946 
947 static void
948 m_mclfree(void *arg)
949 {
950 	mbcluster_t mcl = arg;
951 
952 	KKASSERT(mcl->mcl_magic == MCL_MAGIC);
953 	KKASSERT(mcl->mcl_refs > 0);
954 	crit_enter();
955 	if (--mcl->mcl_refs == 0) {
956 		if (mbstat.m_clfree < mcl_free_max) {
957 			mcl->mcl_next = mclfree;
958 			mclfree = mcl;
959 			++mbstat.m_clfree;
960 			MCLWAKEUP();
961 		} else {
962 			mcl->mcl_magic = -1;
963 			free(mcl->mcl_data, M_MBUFCL);
964 			free(mcl, M_MBUFCL);
965 			--mbstat.m_clusters;
966 		}
967 	}
968 	crit_exit();
969 }
970 
971 static void
972 m_mclref(void *arg)
973 {
974 	mbcluster_t mcl = arg;
975 
976 	KKASSERT(mcl->mcl_magic == MCL_MAGIC);
977 	crit_enter();
978 	++mcl->mcl_refs;
979 	crit_exit();
980 }
981 
982 /*
983  * Helper routines for M_EXT reference/free
984  */
985 static __inline void
986 m_extref(const struct mbuf *m)
987 {
988 	KKASSERT(m->m_ext.ext_nfree.any != NULL);
989 	crit_enter();
990 	if (m->m_flags & M_EXT_OLD)
991 		m->m_ext.ext_nref.old(m->m_ext.ext_buf, m->m_ext.ext_size);
992 	else
993 		m->m_ext.ext_nref.new(m->m_ext.ext_arg);
994 	crit_exit();
995 }
996 
997 /*
998  * m_free()
999  *
1000  * Free a single mbuf and any associated external storage.  The successor,
1001  * if any, is returned.
1002  *
1003  * We do need to check non-first mbufs for m_aux, since some existing
1004  * code does not call M_PREPEND properly.
1005  * (example: call to bpf_mtap from drivers)
1006  */
1007 struct mbuf *
1008 m_free(struct mbuf *m)
1009 {
1010 	struct mbuf *n;
1011 
1012 	crit_enter();
1013 	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
1014 
1015 	/*
1016 	 * Adjust our type count and delete any attached chains if the
1017 	 * mbuf is a packet header.
1018 	 */
1019 	if ((m->m_flags & M_PKTHDR) != 0)
1020 		m_tag_delete_chain(m, NULL);
1021 
1022 	/*
1023 	 * Place the mbuf on the appropriate free list.  Try to maintain a
1024 	 * small cache of mbuf+cluster pairs.
1025 	 */
1026 	n = m->m_next;
1027 	m->m_next = NULL;
1028 	if (m->m_flags & M_EXT) {
1029 		KKASSERT(m->m_ext.ext_nfree.any != NULL);
1030 		if (mcl_pool_count < mcl_pool_max && m && m->m_next == NULL &&
1031 		    (m->m_flags & (M_PKTHDR|M_EXT_CLUSTER)) == (M_PKTHDR|M_EXT_CLUSTER) &&
1032 		    m->m_type == MT_DATA && M_EXT_WRITABLE(m) ) {
1033 			KKASSERT(((mbcluster_t)m->m_ext.ext_arg)->mcl_magic == MCL_MAGIC);
1034 			m->m_nextpkt = mcl_pool;
1035 			mcl_pool = m;
1036 			++mcl_pool_count;
1037 			m = NULL;
1038 		} else {
1039 			if (m->m_flags & M_EXT_OLD)
1040 				m->m_ext.ext_nfree.old(m->m_ext.ext_buf, m->m_ext.ext_size);
1041 			else
1042 				m->m_ext.ext_nfree.new(m->m_ext.ext_arg);
1043 			m->m_flags = 0;
1044 			m->m_ext.ext_arg = NULL;
1045 			m->m_ext.ext_nref.new = NULL;
1046 			m->m_ext.ext_nfree.new = NULL;
1047 		}
1048 	}
1049 	if (m) {
1050 		--mbtypes[m->m_type];
1051 		if (mbtypes[MT_FREE] < mbuf_free_max) {
1052 			m->m_type = MT_FREE;
1053 			mbtypes[MT_FREE]++;
1054 			m->m_next = mmbfree;
1055 			mmbfree = m;
1056 			MMBWAKEUP();
1057 		} else {
1058 			free(m, M_MBUF);
1059 			--mbstat.m_mbufs;
1060 		}
1061 	}
1062 	crit_exit();
1063 	return (n);
1064 }
1065 
1066 void
1067 m_freem(struct mbuf *m)
1068 {
1069 	crit_enter();
1070 	while (m)
1071 		m = m_free(m);
1072 	crit_exit();
1073 }
1074 
1075 /*
1076  * Mbuffer utility routines.
1077  */
1078 
1079 /*
1080  * Lesser-used path for M_PREPEND:
1081  * allocate new mbuf to prepend to chain,
1082  * copy junk along.
1083  */
1084 struct mbuf *
1085 m_prepend(struct mbuf *m, int len, int how)
1086 {
1087 	struct mbuf *mn;
1088 
1089 	MGET(mn, how, m->m_type);
1090 	if (mn == (struct mbuf *)NULL) {
1091 		m_freem(m);
1092 		return ((struct mbuf *)NULL);
1093 	}
1094 	if (m->m_flags & M_PKTHDR)
1095 		M_MOVE_PKTHDR(mn, m);
1096 	mn->m_next = m;
1097 	m = mn;
1098 	if (len < MHLEN)
1099 		MH_ALIGN(m, len);
1100 	m->m_len = len;
1101 	return (m);
1102 }
1103 
1104 /*
1105  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1106  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
1107  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
1108  * Note that the copy is read-only, because clusters are not copied,
1109  * only their reference counts are incremented.
1110  */
1111 #define MCFail (mbstat.m_mcfail)
1112 
1113 struct mbuf *
1114 m_copym(const struct mbuf *m, int off0, int len, int wait)
1115 {
1116 	struct mbuf *n, **np;
1117 	int off = off0;
1118 	struct mbuf *top;
1119 	int copyhdr = 0;
1120 
1121 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
1122 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
1123 	if (off == 0 && m->m_flags & M_PKTHDR)
1124 		copyhdr = 1;
1125 	while (off > 0) {
1126 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1127 		if (off < m->m_len)
1128 			break;
1129 		off -= m->m_len;
1130 		m = m->m_next;
1131 	}
1132 	np = &top;
1133 	top = 0;
1134 	while (len > 0) {
1135 		if (m == 0) {
1136 			KASSERT(len == M_COPYALL,
1137 			    ("m_copym, length > size of mbuf chain"));
1138 			break;
1139 		}
1140 		MGET(n, wait, m->m_type);
1141 		*np = n;
1142 		if (n == 0)
1143 			goto nospace;
1144 		if (copyhdr) {
1145 			if (!m_dup_pkthdr(n, m, wait))
1146 				goto nospace;
1147 			if (len == M_COPYALL)
1148 				n->m_pkthdr.len -= off0;
1149 			else
1150 				n->m_pkthdr.len = len;
1151 			copyhdr = 0;
1152 		}
1153 		n->m_len = min(len, m->m_len - off);
1154 		if (m->m_flags & M_EXT) {
1155 			n->m_data = m->m_data + off;
1156 			m_extref(m);
1157 			n->m_ext = m->m_ext;
1158 			n->m_flags |= m->m_flags &
1159 					(M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
1160 		} else {
1161 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1162 			    (unsigned)n->m_len);
1163 		}
1164 		if (len != M_COPYALL)
1165 			len -= n->m_len;
1166 		off = 0;
1167 		m = m->m_next;
1168 		np = &n->m_next;
1169 	}
1170 	if (top == 0)
1171 		MCFail++;
1172 	return (top);
1173 nospace:
1174 	m_freem(top);
1175 	MCFail++;
1176 	return (0);
1177 }
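
/*
 * Usage sketch (hypothetical caller code): retransmission-style paths make
 * a reference-counted copy of a packet instead of duplicating its data:
 *
 *	struct mbuf *n;
 *
 *	n = m_copym(m, 0, M_COPYALL, MB_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 *	(n shares m's clusters and must be treated as read-only)
 */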
1178 
1179 /*
1180  * Copy an entire packet, including header (which must be present).
1181  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1182  * Note that the copy is read-only, because clusters are not copied,
1183  * only their reference counts are incremented.
1184  * Preserve alignment of the first mbuf so if the creator has left
1185  * some room at the beginning (e.g. for inserting protocol headers)
1186  * the copies also have the room available.
1187  */
1188 struct mbuf *
1189 m_copypacket(struct mbuf *m, int how)
1190 {
1191 	struct mbuf *top, *n, *o;
1192 
1193 	MGET(n, how, m->m_type);
1194 	top = n;
1195 	if (!n)
1196 		goto nospace;
1197 
1198 	if (!m_dup_pkthdr(n, m, how))
1199 		goto nospace;
1200 	n->m_len = m->m_len;
1201 	if (m->m_flags & M_EXT) {
1202 		n->m_data = m->m_data;
1203 		m_extref(m);
1204 		n->m_ext = m->m_ext;
1205 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
1206 	} else {
1207 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
1208 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1209 	}
1210 
1211 	m = m->m_next;
1212 	while (m) {
1213 		MGET(o, how, m->m_type);
1214 		if (!o)
1215 			goto nospace;
1216 
1217 		n->m_next = o;
1218 		n = n->m_next;
1219 
1220 		n->m_len = m->m_len;
1221 		if (m->m_flags & M_EXT) {
1222 			n->m_data = m->m_data;
1223 			m_extref(m);
1224 			n->m_ext = m->m_ext;
1225 			n->m_flags |= m->m_flags &
1226 					 (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
1227 		} else {
1228 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1229 		}
1230 
1231 		m = m->m_next;
1232 	}
1233 	return top;
1234 nospace:
1235 	m_freem(top);
1236 	MCFail++;
1237 	return 0;
1238 }
1239 
1240 /*
1241  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1242  * continuing for "len" bytes, into the indicated buffer.
1243  */
1244 void
1245 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1246 {
1247 	unsigned count;
1248 
1249 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1250 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1251 	while (off > 0) {
1252 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1253 		if (off < m->m_len)
1254 			break;
1255 		off -= m->m_len;
1256 		m = m->m_next;
1257 	}
1258 	while (len > 0) {
1259 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1260 		count = min(m->m_len - off, len);
1261 		bcopy(mtod(m, caddr_t) + off, cp, count);
1262 		len -= count;
1263 		cp += count;
1264 		off = 0;
1265 		m = m->m_next;
1266 	}
1267 }
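
/*
 * Usage sketch (hypothetical caller code): protocol input paths often use
 * m_copydata() to pull a fixed-size header into a local buffer regardless
 * of how the chain happens to be fragmented:
 *
 *	struct ip iphdr;
 *
 *	m_copydata(m, 0, sizeof(iphdr), (caddr_t)&iphdr);
 */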
1268 
1269 /*
1270  * Copy a packet header mbuf chain into a completely new chain, including
1271  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1272  * you need a writable copy of an mbuf chain.
1273  */
1274 struct mbuf *
1275 m_dup(struct mbuf *m, int how)
1276 {
1277 	struct mbuf **p, *top = NULL;
1278 	int remain, moff, nsize;
1279 
1280 	/* Sanity check */
1281 	if (m == NULL)
1282 		return (0);
1283 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __FUNCTION__));
1284 
1285 	/* While there's more data, get a new mbuf, tack it on, and fill it */
1286 	remain = m->m_pkthdr.len;
1287 	moff = 0;
1288 	p = &top;
1289 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
1290 		struct mbuf *n;
1291 
1292 		/* Get the next new mbuf */
1293 		MGET(n, how, m->m_type);
1294 		if (n == NULL)
1295 			goto nospace;
1296 		if (top == NULL) {		/* first one, must be PKTHDR */
1297 			if (!m_dup_pkthdr(n, m, how))
1298 				goto nospace;
1299 			nsize = MHLEN;
1300 		} else				/* not the first one */
1301 			nsize = MLEN;
1302 		if (remain >= MINCLSIZE) {
1303 			MCLGET(n, how);
1304 			if ((n->m_flags & M_EXT) == 0) {
1305 				(void)m_free(n);
1306 				goto nospace;
1307 			}
1308 			nsize = MCLBYTES;
1309 		}
1310 		n->m_len = 0;
1311 
1312 		/* Link it into the new chain */
1313 		*p = n;
1314 		p = &n->m_next;
1315 
1316 		/* Copy data from original mbuf(s) into new mbuf */
1317 		while (n->m_len < nsize && m != NULL) {
1318 			int chunk = min(nsize - n->m_len, m->m_len - moff);
1319 
1320 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1321 			moff += chunk;
1322 			n->m_len += chunk;
1323 			remain -= chunk;
1324 			if (moff == m->m_len) {
1325 				m = m->m_next;
1326 				moff = 0;
1327 			}
1328 		}
1329 
1330 		/* Check correct total mbuf length */
1331 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1332 		    	("%s: bogus m_pkthdr.len", __FUNCTION__));
1333 	}
1334 	return (top);
1335 
1336 nospace:
1337 	m_freem(top);
1338 	MCFail++;
1339 	return (0);
1340 }
1341 
1342 /*
1343  * Concatenate mbuf chain n to m.
1344  * Both chains must be of the same type (e.g. MT_DATA).
1345  * Any m_pkthdr is not updated.
1346  */
1347 void
1348 m_cat(struct mbuf *m, struct mbuf *n)
1349 {
1350 	while (m->m_next)
1351 		m = m->m_next;
1352 	while (n) {
1353 		if (m->m_flags & M_EXT ||
1354 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1355 			/* just join the two chains */
1356 			m->m_next = n;
1357 			return;
1358 		}
1359 		/* splat the data from one into the other */
1360 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1361 		    (u_int)n->m_len);
1362 		m->m_len += n->m_len;
1363 		n = m_free(n);
1364 	}
1365 }
1366 
1367 void
1368 m_adj(struct mbuf *mp, int req_len)
1369 {
1370 	int len = req_len;
1371 	struct mbuf *m;
1372 	int count;
1373 
1374 	if ((m = mp) == NULL)
1375 		return;
1376 	if (len >= 0) {
1377 		/*
1378 		 * Trim from head.
1379 		 */
1380 		while (m != NULL && len > 0) {
1381 			if (m->m_len <= len) {
1382 				len -= m->m_len;
1383 				m->m_len = 0;
1384 				m = m->m_next;
1385 			} else {
1386 				m->m_len -= len;
1387 				m->m_data += len;
1388 				len = 0;
1389 			}
1390 		}
1391 		m = mp;
1392 		if (mp->m_flags & M_PKTHDR)
1393 			m->m_pkthdr.len -= (req_len - len);
1394 	} else {
1395 		/*
1396 		 * Trim from tail.  Scan the mbuf chain,
1397 		 * calculating its length and finding the last mbuf.
1398 		 * If the adjustment only affects this mbuf, then just
1399 		 * adjust and return.  Otherwise, rescan and truncate
1400 		 * after the remaining size.
1401 		 */
1402 		len = -len;
1403 		count = 0;
1404 		for (;;) {
1405 			count += m->m_len;
1406 			if (m->m_next == (struct mbuf *)0)
1407 				break;
1408 			m = m->m_next;
1409 		}
1410 		if (m->m_len >= len) {
1411 			m->m_len -= len;
1412 			if (mp->m_flags & M_PKTHDR)
1413 				mp->m_pkthdr.len -= len;
1414 			return;
1415 		}
1416 		count -= len;
1417 		if (count < 0)
1418 			count = 0;
1419 		/*
1420 		 * Correct length for chain is "count".
1421 		 * Find the mbuf with last data, adjust its length,
1422 		 * and toss data from remaining mbufs on chain.
1423 		 */
1424 		m = mp;
1425 		if (m->m_flags & M_PKTHDR)
1426 			m->m_pkthdr.len = count;
1427 		for (; m; m = m->m_next) {
1428 			if (m->m_len >= count) {
1429 				m->m_len = count;
1430 				break;
1431 			}
1432 			count -= m->m_len;
1433 		}
1434 		while (m->m_next)
1435 			(m = m->m_next) ->m_len = 0;
1436 	}
1437 }
1438 
1439 /*
1440  * Rearrange an mbuf chain so that len bytes are contiguous
1441  * and in the data area of an mbuf (so that mtod will work for a structure
1442  * of size len).  Returns the resulting mbuf chain on success, frees it and
1443  * returns null on failure.  If there is room, it will add up to
1444  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1445  * avoid being called next time.
1446  */
1447 #define MPFail (mbstat.m_mpfail)
1448 
1449 struct mbuf *
1450 m_pullup(struct mbuf *n, int len)
1451 {
1452 	struct mbuf *m;
1453 	int count;
1454 	int space;
1455 
1456 	/*
1457 	 * If first mbuf has no cluster, and has room for len bytes
1458 	 * without shifting current data, pullup into it,
1459 	 * otherwise allocate a new mbuf to prepend to the chain.
1460 	 */
1461 	if ((n->m_flags & M_EXT) == 0 &&
1462 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
1463 		if (n->m_len >= len)
1464 			return (n);
1465 		m = n;
1466 		n = n->m_next;
1467 		len -= m->m_len;
1468 	} else {
1469 		if (len > MHLEN)
1470 			goto bad;
1471 		MGET(m, MB_DONTWAIT, n->m_type);
1472 		if (m == 0)
1473 			goto bad;
1474 		m->m_len = 0;
1475 		if (n->m_flags & M_PKTHDR)
1476 			M_MOVE_PKTHDR(m, n);
1477 	}
1478 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1479 	do {
1480 		count = min(min(max(len, max_protohdr), space), n->m_len);
1481 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1482 		  (unsigned)count);
1483 		len -= count;
1484 		m->m_len += count;
1485 		n->m_len -= count;
1486 		space -= count;
1487 		if (n->m_len)
1488 			n->m_data += count;
1489 		else
1490 			n = m_free(n);
1491 	} while (len > 0 && n);
1492 	if (len > 0) {
1493 		(void) m_free(m);
1494 		goto bad;
1495 	}
1496 	m->m_next = n;
1497 	return (m);
1498 bad:
1499 	m_freem(n);
1500 	MPFail++;
1501 	return (0);
1502 }
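
/*
 * Usage sketch (hypothetical caller code): the classic use of m_pullup()
 * is to make a protocol header contiguous before pointing a structure at
 * the mbuf data:
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;		(m_pullup has already freed the chain)
 *	ip = mtod(m, struct ip *);
 */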
1503 
1504 /*
1505  * Partition an mbuf chain in two pieces, returning the tail --
1506  * all but the first len0 bytes.  In case of failure, it returns NULL and
1507  * attempts to restore the chain to its original state.
1508  *
1509  * Note that the resulting mbufs might be read-only, because the new
1510  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1511  * the "breaking point" happens to lie within a cluster mbuf. Use the
1512  * M_WRITABLE() macro to check for this case.
1513  */
1514 struct mbuf *
1515 m_split(struct mbuf *m0, int len0, int wait)
1516 {
1517 	struct mbuf *m, *n;
1518 	unsigned len = len0, remain;
1519 
1520 	for (m = m0; m && len > m->m_len; m = m->m_next)
1521 		len -= m->m_len;
1522 	if (m == 0)
1523 		return (0);
1524 	remain = m->m_len - len;
1525 	if (m0->m_flags & M_PKTHDR) {
1526 		MGETHDR(n, wait, m0->m_type);
1527 		if (n == 0)
1528 			return (0);
1529 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1530 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1531 		m0->m_pkthdr.len = len0;
1532 		if (m->m_flags & M_EXT)
1533 			goto extpacket;
1534 		if (remain > MHLEN) {
1535 			/* m can't be the lead packet */
1536 			MH_ALIGN(n, 0);
1537 			n->m_next = m_split(m, len, wait);
1538 			if (n->m_next == 0) {
1539 				(void) m_free(n);
1540 				return (0);
1541 			} else {
1542 				n->m_len = 0;
1543 				return (n);
1544 			}
1545 		} else
1546 			MH_ALIGN(n, remain);
1547 	} else if (remain == 0) {
1548 		n = m->m_next;
1549 		m->m_next = 0;
1550 		return (n);
1551 	} else {
1552 		MGET(n, wait, m->m_type);
1553 		if (n == 0)
1554 			return (0);
1555 		M_ALIGN(n, remain);
1556 	}
1557 extpacket:
1558 	if (m->m_flags & M_EXT) {
1559 		n->m_data = m->m_data + len;
1560 		m_extref(m);
1561 		n->m_ext = m->m_ext;
1562 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
1563 	} else {
1564 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1565 	}
1566 	n->m_len = remain;
1567 	m->m_len = len;
1568 	n->m_next = m->m_next;
1569 	m->m_next = 0;
1570 	return (n);
1571 }
1572 /*
1573  * Routine to copy from device local memory into mbufs.
1574  */
1575 struct mbuf *
1576 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
1577 	void (*copy) (char *from, caddr_t to, u_int len))
1578 {
1579 	struct mbuf *m;
1580 	struct mbuf *top = 0, **mp = &top;
1581 	int off = off0, len;
1582 	char *cp;
1583 	char *epkt;
1584 
1585 	cp = buf;
1586 	epkt = cp + totlen;
1587 	if (off) {
1588 		cp += off + 2 * sizeof(u_short);
1589 		totlen -= 2 * sizeof(u_short);
1590 	}
1591 	MGETHDR(m, MB_DONTWAIT, MT_DATA);
1592 	if (m == 0)
1593 		return (0);
1594 	m->m_pkthdr.rcvif = ifp;
1595 	m->m_pkthdr.len = totlen;
1596 	m->m_len = MHLEN;
1597 
1598 	while (totlen > 0) {
1599 		if (top) {
1600 			MGET(m, MB_DONTWAIT, MT_DATA);
1601 			if (m == 0) {
1602 				m_freem(top);
1603 				return (0);
1604 			}
1605 			m->m_len = MLEN;
1606 		}
1607 		len = min(totlen, epkt - cp);
1608 		if (len >= MINCLSIZE) {
1609 			MCLGET(m, MB_DONTWAIT);
1610 			if (m->m_flags & M_EXT)
1611 				m->m_len = len = min(len, MCLBYTES);
1612 			else
1613 				len = m->m_len;
1614 		} else {
1615 			/*
1616 			 * Place initial small packet/header at end of mbuf.
1617 			 */
1618 			if (len < m->m_len) {
1619 				if (top == 0 && len + max_linkhdr <= m->m_len)
1620 					m->m_data += max_linkhdr;
1621 				m->m_len = len;
1622 			} else
1623 				len = m->m_len;
1624 		}
1625 		if (copy)
1626 			copy(cp, mtod(m, caddr_t), (unsigned)len);
1627 		else
1628 			bcopy(cp, mtod(m, caddr_t), (unsigned)len);
1629 		cp += len;
1630 		*mp = m;
1631 		mp = &m->m_next;
1632 		totlen -= len;
1633 		if (cp == epkt)
1634 			cp = buf;
1635 	}
1636 	return (top);
1637 }
1638 
1639 /*
1640  * Copy data from a buffer back into the indicated mbuf chain,
1641  * starting "off" bytes from the beginning, extending the mbuf
1642  * chain if necessary.
1643  */
1644 void
1645 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1646 {
1647 	int mlen;
1648 	struct mbuf *m = m0, *n;
1649 	int totlen = 0;
1650 
1651 	if (m0 == 0)
1652 		return;
1653 	while (off > (mlen = m->m_len)) {
1654 		off -= mlen;
1655 		totlen += mlen;
1656 		if (m->m_next == 0) {
1657 			n = m_getclr(MB_DONTWAIT, m->m_type);
1658 			if (n == 0)
1659 				goto out;
1660 			n->m_len = min(MLEN, len + off);
1661 			m->m_next = n;
1662 		}
1663 		m = m->m_next;
1664 	}
1665 	while (len > 0) {
1666 		mlen = min (m->m_len - off, len);
1667 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1668 		cp += mlen;
1669 		len -= mlen;
1670 		mlen += off;
1671 		off = 0;
1672 		totlen += mlen;
1673 		if (len == 0)
1674 			break;
1675 		if (m->m_next == 0) {
1676 			n = m_get(MB_DONTWAIT, m->m_type);
1677 			if (n == 0)
1678 				break;
1679 			n->m_len = min(MLEN, len);
1680 			m->m_next = n;
1681 		}
1682 		m = m->m_next;
1683 	}
1684 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1685 		m->m_pkthdr.len = totlen;
1686 }
1687 
1688 void
1689 m_print(const struct mbuf *m)
1690 {
1691 	int len;
1692 	const struct mbuf *m2;
1693 
1694 	len = m->m_pkthdr.len;
1695 	m2 = m;
1696 	while (len) {
1697 		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1698 		len -= m2->m_len;
1699 		m2 = m2->m_next;
1700 	}
1701 	return;
1702 }
1703 
1704 /*
1705  * "Move" mbuf pkthdr from "from" to "to".
1706  * "from" must have M_PKTHDR set, and "to" must be empty.
1707  */
1708 void
1709 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1710 {
1711 	KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster"));
1712 
1713 	to->m_flags = from->m_flags & M_COPYFLAGS;
1714 	to->m_data = to->m_pktdat;
1715 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
1716 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
1717 	from->m_flags &= ~M_PKTHDR;
1718 }
1719 
1720 /*
1721  * Duplicate "from"'s mbuf pkthdr in "to".
1722  * "from" must have M_PKTHDR set, and "to" must be empty.
1723  * In particular, this does a deep copy of the packet tags.
1724  */
1725 int
1726 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1727 {
1728 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
1729 	if ((to->m_flags & M_EXT) == 0)
1730 		to->m_data = to->m_pktdat;
1731 	to->m_pkthdr = from->m_pkthdr;
1732 	SLIST_INIT(&to->m_pkthdr.tags);
1733 	return (m_tag_copy_chain(to, from, how));
1734 }
1735 
1736 /*
1737  * Defragment an mbuf chain, returning the shortest possible
1738  * chain of mbufs and clusters.  If allocation fails and
1739  * this cannot be completed, NULL will be returned, but
1740  * the passed in chain will be unchanged.  Upon success,
1741  * the original chain will be freed, and the new chain
1742  * will be returned.
1743  *
1744  * If a non-packet header is passed in, the original
1745  * mbuf (chain?) will be returned unharmed.
1746  */
1747 struct mbuf *
1748 m_defrag(struct mbuf *m0, int how)
1749 {
1750 	struct mbuf	*m_new = NULL, *m_final = NULL;
1751 	int		progress = 0, length;
1752 
1753 	if (!(m0->m_flags & M_PKTHDR))
1754 		return (m0);
1755 
1756 #ifdef MBUF_STRESS_TEST
1757 	if (m_defragrandomfailures) {
1758 		int temp = arc4random() & 0xff;
1759 		if (temp == 0xba)
1760 			goto nospace;
1761 	}
1762 #endif
1763 
1764 	if (m0->m_pkthdr.len > MHLEN)
1765 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1766 	else
1767 		m_final = m_gethdr(how, MT_DATA);
1768 
1769 	if (m_final == NULL)
1770 		goto nospace;
1771 
1772 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1773 		goto nospace;
1774 
1775 	m_new = m_final;
1776 
1777 	while (progress < m0->m_pkthdr.len) {
1778 		length = m0->m_pkthdr.len - progress;
1779 		if (length > MCLBYTES)
1780 			length = MCLBYTES;
1781 
1782 		if (m_new == NULL) {
1783 			if (length > MLEN)
1784 				m_new = m_getcl(how, MT_DATA, 0);
1785 			else
1786 				m_new = m_get(how, MT_DATA);
1787 			if (m_new == NULL)
1788 				goto nospace;
1789 		}
1790 
1791 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1792 		progress += length;
1793 		m_new->m_len = length;
1794 		if (m_new != m_final)
1795 			m_cat(m_final, m_new);
1796 		m_new = NULL;
1797 	}
1798 	if (m0->m_next == NULL)
1799 		m_defraguseless++;
1800 	m_freem(m0);
1801 	m0 = m_final;
1802 	m_defragpackets++;
1803 	m_defragbytes += m0->m_pkthdr.len;
1804 	return (m0);
1805 nospace:
1806 	m_defragfailure++;
1807 	if (m_new)
1808 		m_free(m_new);
1809 	if (m_final)
1810 		m_freem(m_final);
1811 	return (NULL);
1812 }
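
/*
 * Usage sketch (hypothetical caller code): drivers whose DMA engines only
 * handle a limited number of segments may compact a long chain before
 * loading it:
 *
 *	struct mbuf *n;
 *
 *	n = m_defrag(m, MB_DONTWAIT);
 *	if (n == NULL) {
 *		m_freem(m);		(drop the packet; m is unchanged)
 *		return (ENOBUFS);
 *	}
 *	m = n;				(m_defrag freed the original chain)
 */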
1813 
1814 /*
1815  * Move data from uio into mbufs.
1816  * A length of zero means copy the whole uio.
1817  */
1818 struct mbuf *
1819 m_uiomove(struct uio *uio, int wait, int len0)
1820 {
1821 	struct mbuf *head;		/* result mbuf chain */
1822 	struct mbuf *m;			/* current working mbuf */
1823 	struct mbuf **mp;
1824 	int resid, datalen, error;
1825 
1826 	resid = (len0 == 0) ? uio->uio_resid : min(len0, uio->uio_resid);
1827 
1828 	head = NULL;
1829 	mp = &head;
1830 	do {
1831 		if (resid > MHLEN) {
1832 			m = m_getcl(wait, MT_DATA, head == NULL ? M_PKTHDR : 0);
1833 			if (m == NULL)
1834 				goto failed;
1835 			if (m->m_flags & M_PKTHDR)
1836 				m->m_pkthdr.len = 0;
1837 		} else {
1838 			if (head == NULL) {
1839 				MGETHDR(m, wait, MT_DATA);
1840 				if (m == NULL)
1841 					goto failed;
1842 				m->m_pkthdr.len = 0;
1843 				/* Leave room for protocol headers. */
1844 				if (resid < MHLEN)
1845 					MH_ALIGN(m, resid);
1846 			} else {
1847 				MGET(m, wait, MT_DATA);
1848 				if (m == NULL)
1849 					goto failed;
1850 			}
1851 		}
1852 		datalen = min(MCLBYTES, resid);
1853 		error = uiomove(mtod(m, caddr_t), datalen, uio);
1854 		if (error) {
1855 			m_free(m);
1856 			goto failed;
1857 		}
1858 		m->m_len = datalen;
1859 		*mp = m;
1860 		mp = &m->m_next;
1861 		head->m_pkthdr.len += datalen;
1862 		resid -= datalen;
1863 	} while (resid > 0);
1864 
1865 	return (head);
1866 
1867 failed:
1868 	if (head)
1869 		m_freem(head);
1870 	return (NULL);
1871 }
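
/*
 * Usage sketch (hypothetical caller code): a write path that needs the
 * caller's data in mbuf form might do:
 *
 *	struct mbuf *m;
 *
 *	m = m_uiomove(uio, MB_WAIT, 0);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	(m->m_pkthdr.len reflects the number of bytes copied in)
 */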
1872