1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 #include "opt_param.h"
34 #include "opt_mbuf_stress_test.h"
35 #include "opt_mbuf_profiling.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/limits.h>
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/sysctl.h>
45 #include <sys/domain.h>
46 #include <sys/protosw.h>
47 #include <sys/uio.h>
48 #include <sys/vmmeter.h>
49 #include <sys/sbuf.h>
50 #include <sys/sdt.h>
51 #include <vm/vm.h>
52 #include <vm/vm_pageout.h>
53 #include <vm/vm_page.h>
54
55 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
56 "struct mbuf *", "mbufinfo_t *",
57 "uint32_t", "uint32_t",
58 "uint16_t", "uint16_t",
59 "uint32_t", "uint32_t",
60 "uint32_t", "uint32_t");
61
62 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw,
63 "uint32_t", "uint32_t",
64 "uint16_t", "uint16_t",
65 "struct mbuf *", "mbufinfo_t *");
66
67 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
68 "uint32_t", "uint32_t",
69 "uint16_t", "uint16_t",
70 "struct mbuf *", "mbufinfo_t *");
71
72 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw,
73 "uint32_t", "uint32_t",
74 "uint16_t", "uint16_t",
75 "struct mbuf *", "mbufinfo_t *");
76
77 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
78 "uint32_t", "uint32_t",
79 "uint16_t", "uint16_t",
80 "struct mbuf *", "mbufinfo_t *");
81
82 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
83 "uint32_t", "uint32_t",
84 "uint16_t", "uint16_t",
85 "uint32_t", "uint32_t",
86 "struct mbuf *", "mbufinfo_t *");
87
88 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl,
89 "uint32_t", "uint32_t",
90 "uint16_t", "uint16_t",
91 "uint32_t", "uint32_t",
92 "uint32_t", "uint32_t",
93 "struct mbuf *", "mbufinfo_t *");
94
95 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
96 "struct mbuf *", "mbufinfo_t *",
97 "uint32_t", "uint32_t",
98 "uint32_t", "uint32_t");
99
100 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
101 "struct mbuf *", "mbufinfo_t *",
102 "uint32_t", "uint32_t",
103 "uint32_t", "uint32_t",
104 "void*", "void*");
105
106 SDT_PROBE_DEFINE(sdt, , , m__cljset);
107
108 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
109 "struct mbuf *", "mbufinfo_t *");
110
111 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
112 "struct mbuf *", "mbufinfo_t *");
113
114 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freemp,
115 "struct mbuf *", "mbufinfo_t *");
116
117 #include <security/mac/mac_framework.h>
118
119 /*
120 * Provide minimum possible defaults for link and protocol header space,
121 * assuming IPv4 over Ethernet. Enabling IPv6, IEEE802.11 or some other
122 * protocol may grow these values.
123 */
124 u_int max_linkhdr = 16;
125 u_int max_protohdr = 40;
126 u_int max_hdr = 16 + 40;
127 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
128 &max_linkhdr, 16, "Size of largest link layer header");
129 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
130 &max_protohdr, 40, "Size of largest protocol layer header");
131 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
132 &max_hdr, 16 + 40, "Size of largest link plus protocol header");
133
134 static void
max_hdr_grow(void)135 max_hdr_grow(void)
136 {
137
138 max_hdr = max_linkhdr + max_protohdr;
139 MPASS(max_hdr <= MHLEN);
140 }
141
142 void
max_linkhdr_grow(u_int new)143 max_linkhdr_grow(u_int new)
144 {
145
146 if (new > max_linkhdr) {
147 max_linkhdr = new;
148 max_hdr_grow();
149 }
150 }
151
152 void
max_protohdr_grow(u_int new)153 max_protohdr_grow(u_int new)
154 {
155
156 if (new > max_protohdr) {
157 max_protohdr = new;
158 max_hdr_grow();
159 }
160 }
161
#ifdef MBUF_STRESS_TEST
/*
 * Counters compiled in only with MBUF_STRESS_TEST and exported under
 * kern.ipc.  Presumably they are maintained by the m_defrag() code
 * elsewhere in this file -- nothing in the code visible here updates them.
 * All are read-only from userland except m_defragrandomfailures, which is
 * read-write.
 */
int	m_defragpackets;
int	m_defragbytes;
int	m_defraguseless;
int	m_defragfailure;
int	m_defragrandomfailures;

SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets,
    0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes,
    0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless,
    0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure,
    0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
    &m_defragrandomfailures, 0, "");
#endif /* MBUF_STRESS_TEST */
180
181 /*
182 * Ensure the correct size of various mbuf parameters. It could be off due
183 * to compiler-induced padding and alignment artifacts.
184 */
185 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
186 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
187
188 /*
189 * mbuf data storage should be 64-bit aligned regardless of architectural
190 * pointer size; check this is the case with and without a packet header.
191 */
192 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
193 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
194
195 /*
196 * While the specific values here don't matter too much (i.e., +/- a few
197 * words), we do want to ensure that changes to these values are carefully
198 * reasoned about and properly documented. This is especially the case as
199 * network-protocol and device-driver modules encode these layouts, and must
200 * be recompiled if the structures change. Check these values at compile time
201 * against the ones documented in comments in mbuf.h.
202 *
203 * NB: Possibly they should be documented there via #define's and not just
204 * comments.
205 */
206 #if defined(__LP64__)
207 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
208 CTASSERT(sizeof(struct pkthdr) == 64);
209 CTASSERT(sizeof(struct m_ext) == 160);
210 #else
211 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
212 CTASSERT(sizeof(struct pkthdr) == 56);
213 #if defined(__powerpc__) && defined(BOOKE)
214 /* PowerPC booke has 64-bit physical pointers. */
215 CTASSERT(sizeof(struct m_ext) == 176);
216 #else
217 CTASSERT(sizeof(struct m_ext) == 172);
218 #endif
219 #endif
220
221 /*
222 * Assert that the queue(3) macros produce code of the same size as an old
223 * plain pointer does.
224 */
225 #ifdef INVARIANTS
226 static struct mbuf __used m_assertbuf;
227 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
228 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
229 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
230 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
231 #endif
232
233 /*
234 * Attach the cluster from *m to *n, set up m_ext in *n
235 * and bump the refcount of the cluster.
236 */
237 void
mb_dupcl(struct mbuf * n,struct mbuf * m)238 mb_dupcl(struct mbuf *n, struct mbuf *m)
239 {
240 volatile u_int *refcnt;
241
242 KASSERT(m->m_flags & (M_EXT | M_EXTPG),
243 ("%s: M_EXT | M_EXTPG not set on %p", __func__, m));
244 KASSERT(!(n->m_flags & (M_EXT | M_EXTPG)),
245 ("%s: M_EXT | M_EXTPG set on %p", __func__, n));
246
247 /*
248 * Cache access optimization.
249 *
250 * o Regular M_EXT storage doesn't need full copy of m_ext, since
251 * the holder of the 'ext_count' is responsible to carry the free
252 * routine and its arguments.
253 * o M_EXTPG data is split between main part of mbuf and m_ext, the
254 * main part is copied in full, the m_ext part is similar to M_EXT.
255 * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is
256 * special - it needs full copy of m_ext into each mbuf, since any
257 * copy could end up as the last to free.
258 */
259 if (m->m_flags & M_EXTPG) {
260 bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy,
261 __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy));
262 bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen);
263 } else if (m->m_ext.ext_type == EXT_EXTREF)
264 bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext));
265 else
266 bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
267
268 n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG);
269
270 /* See if this is the mbuf that holds the embedded refcount. */
271 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
272 refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
273 n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
274 } else {
275 KASSERT(m->m_ext.ext_cnt != NULL,
276 ("%s: no refcounting pointer on %p", __func__, m));
277 refcnt = m->m_ext.ext_cnt;
278 }
279
280 if (*refcnt == 1)
281 *refcnt += 1;
282 else
283 atomic_add_int(refcnt, 1);
284 }
285
286 void
m_demote_pkthdr(struct mbuf * m)287 m_demote_pkthdr(struct mbuf *m)
288 {
289
290 M_ASSERTPKTHDR(m);
291 M_ASSERT_NO_SND_TAG(m);
292
293 m_tag_delete_chain(m, NULL);
294 m->m_flags &= ~M_PKTHDR;
295 bzero(&m->m_pkthdr, sizeof(struct pkthdr));
296 }
297
298 /*
299 * Clean up mbuf (chain) from any tags and packet headers.
300 * If "all" is set then the first mbuf in the chain will be
301 * cleaned too.
302 */
303 void
m_demote(struct mbuf * m0,int all,int flags)304 m_demote(struct mbuf *m0, int all, int flags)
305 {
306 struct mbuf *m;
307
308 flags |= M_DEMOTEFLAGS;
309
310 for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
311 KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
312 __func__, m, m0));
313 if (m->m_flags & M_PKTHDR)
314 m_demote_pkthdr(m);
315 m->m_flags &= flags;
316 }
317 }
318
319 /*
320 * Sanity checks on mbuf (chain) for use in KASSERT() and general
321 * debugging.
322 * Returns 0 or panics when bad and 1 on all tests passed.
323 * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
324 * blow up later.
325 */
326 int
m_sanity(struct mbuf * m0,int sanitize)327 m_sanity(struct mbuf *m0, int sanitize)
328 {
329 struct mbuf *m;
330 caddr_t a, b;
331 int pktlen = 0;
332
333 #ifdef INVARIANTS
334 #define M_SANITY_ACTION(s) panic("mbuf %p: " s, m)
335 #else
336 #define M_SANITY_ACTION(s) printf("mbuf %p: " s, m)
337 #endif
338
339 for (m = m0; m != NULL; m = m->m_next) {
340 /*
341 * Basic pointer checks. If any of these fails then some
342 * unrelated kernel memory before or after us is trashed.
343 * No way to recover from that.
344 */
345 a = M_START(m);
346 b = a + M_SIZE(m);
347 if ((caddr_t)m->m_data < a)
348 M_SANITY_ACTION("m_data outside mbuf data range left");
349 if ((caddr_t)m->m_data > b)
350 M_SANITY_ACTION("m_data outside mbuf data range right");
351 if ((caddr_t)m->m_data + m->m_len > b)
352 M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
353
354 /* m->m_nextpkt may only be set on first mbuf in chain. */
355 if (m != m0 && m->m_nextpkt != NULL) {
356 if (sanitize) {
357 m_freem(m->m_nextpkt);
358 m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
359 } else
360 M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
361 }
362
363 /* packet length (not mbuf length!) calculation */
364 if (m0->m_flags & M_PKTHDR)
365 pktlen += m->m_len;
366
367 /* m_tags may only be attached to first mbuf in chain. */
368 if (m != m0 && m->m_flags & M_PKTHDR &&
369 !SLIST_EMPTY(&m->m_pkthdr.tags)) {
370 if (sanitize) {
371 m_tag_delete_chain(m, NULL);
372 /* put in 0xDEADC0DE perhaps? */
373 } else
374 M_SANITY_ACTION("m_tags on in-chain mbuf");
375 }
376
377 /* M_PKTHDR may only be set on first mbuf in chain */
378 if (m != m0 && m->m_flags & M_PKTHDR) {
379 if (sanitize) {
380 bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
381 m->m_flags &= ~M_PKTHDR;
382 /* put in 0xDEADCODE and leave hdr flag in */
383 } else
384 M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
385 }
386 }
387 m = m0;
388 if (pktlen && pktlen != m->m_pkthdr.len) {
389 if (sanitize)
390 m->m_pkthdr.len = 0;
391 else
392 M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
393 }
394 return 1;
395
396 #undef M_SANITY_ACTION
397 }
398
399 /*
400 * Non-inlined part of m_init().
401 */
402 int
m_pkthdr_init(struct mbuf * m,int how)403 m_pkthdr_init(struct mbuf *m, int how)
404 {
405 #ifdef MAC
406 int error;
407 #endif
408 m->m_data = m->m_pktdat;
409 bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
410 #ifdef NUMA
411 m->m_pkthdr.numa_domain = M_NODOM;
412 #endif
413 #ifdef MAC
414 /* If the label init fails, fail the alloc */
415 error = mac_mbuf_init(m, how);
416 if (error)
417 return (error);
418 #endif
419
420 return (0);
421 }
422
423 /*
424 * "Move" mbuf pkthdr from "from" to "to".
425 * "from" must have M_PKTHDR set, and "to" must be empty.
426 */
427 void
m_move_pkthdr(struct mbuf * to,struct mbuf * from)428 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
429 {
430
431 #if 0
432 /* see below for why these are not enabled */
433 M_ASSERTPKTHDR(to);
434 /* Note: with MAC, this may not be a good assertion. */
435 KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
436 ("m_move_pkthdr: to has tags"));
437 #endif
438 #ifdef MAC
439 /*
440 * XXXMAC: It could be this should also occur for non-MAC?
441 */
442 if (to->m_flags & M_PKTHDR)
443 m_tag_delete_chain(to, NULL);
444 #endif
445 to->m_flags = (from->m_flags & M_COPYFLAGS) |
446 (to->m_flags & (M_EXT | M_EXTPG));
447 if ((to->m_flags & M_EXT) == 0)
448 to->m_data = to->m_pktdat;
449 to->m_pkthdr = from->m_pkthdr; /* especially tags */
450 SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */
451 from->m_flags &= ~M_PKTHDR;
452 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) {
453 from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
454 from->m_pkthdr.snd_tag = NULL;
455 }
456 }
457
458 /*
459 * Duplicate "from"'s mbuf pkthdr in "to".
460 * "from" must have M_PKTHDR set, and "to" must be empty.
461 * In particular, this does a deep copy of the packet tags.
462 */
463 int
m_dup_pkthdr(struct mbuf * to,const struct mbuf * from,int how)464 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
465 {
466
467 #if 0
468 /*
469 * The mbuf allocator only initializes the pkthdr
470 * when the mbuf is allocated with m_gethdr(). Many users
471 * (e.g. m_copy*, m_prepend) use m_get() and then
472 * smash the pkthdr as needed causing these
473 * assertions to trip. For now just disable them.
474 */
475 M_ASSERTPKTHDR(to);
476 /* Note: with MAC, this may not be a good assertion. */
477 KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
478 #endif
479 MBUF_CHECKSLEEP(how);
480 #ifdef MAC
481 if (to->m_flags & M_PKTHDR)
482 m_tag_delete_chain(to, NULL);
483 #endif
484 to->m_flags = (from->m_flags & M_COPYFLAGS) |
485 (to->m_flags & (M_EXT | M_EXTPG));
486 if ((to->m_flags & M_EXT) == 0)
487 to->m_data = to->m_pktdat;
488 to->m_pkthdr = from->m_pkthdr;
489 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG)
490 m_snd_tag_ref(from->m_pkthdr.snd_tag);
491 SLIST_INIT(&to->m_pkthdr.tags);
492 return (m_tag_copy_chain(to, from, how));
493 }
494
495 /*
496 * Lesser-used path for M_PREPEND:
497 * allocate new mbuf to prepend to chain,
498 * copy junk along.
499 */
500 struct mbuf *
m_prepend(struct mbuf * m,int len,int how)501 m_prepend(struct mbuf *m, int len, int how)
502 {
503 struct mbuf *mn;
504
505 if (m->m_flags & M_PKTHDR)
506 mn = m_gethdr(how, m->m_type);
507 else
508 mn = m_get(how, m->m_type);
509 if (mn == NULL) {
510 m_freem(m);
511 return (NULL);
512 }
513 if (m->m_flags & M_PKTHDR)
514 m_move_pkthdr(mn, m);
515 mn->m_next = m;
516 m = mn;
517 if (len < M_SIZE(m))
518 M_ALIGN(m, len);
519 m->m_len = len;
520 return (m);
521 }
522
523 /*
524 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
525 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
526 * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
527 * Note that the copy is read-only, because clusters are not copied,
528 * only their reference counts are incremented.
529 */
530 struct mbuf *
m_copym(struct mbuf * m,int off0,int len,int wait)531 m_copym(struct mbuf *m, int off0, int len, int wait)
532 {
533 struct mbuf *n, **np;
534 int off = off0;
535 struct mbuf *top;
536 int copyhdr = 0;
537
538 KASSERT(off >= 0, ("m_copym, negative off %d", off));
539 KASSERT(len >= 0, ("m_copym, negative len %d", len));
540 MBUF_CHECKSLEEP(wait);
541 if (off == 0 && m->m_flags & M_PKTHDR)
542 copyhdr = 1;
543 while (off > 0) {
544 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
545 if (off < m->m_len)
546 break;
547 off -= m->m_len;
548 m = m->m_next;
549 }
550 np = ⊤
551 top = NULL;
552 while (len > 0) {
553 if (m == NULL) {
554 KASSERT(len == M_COPYALL,
555 ("m_copym, length > size of mbuf chain"));
556 break;
557 }
558 if (copyhdr)
559 n = m_gethdr(wait, m->m_type);
560 else
561 n = m_get(wait, m->m_type);
562 *np = n;
563 if (n == NULL)
564 goto nospace;
565 if (copyhdr) {
566 if (!m_dup_pkthdr(n, m, wait))
567 goto nospace;
568 if (len == M_COPYALL)
569 n->m_pkthdr.len -= off0;
570 else
571 n->m_pkthdr.len = len;
572 copyhdr = 0;
573 }
574 n->m_len = min(len, m->m_len - off);
575 if (m->m_flags & (M_EXT | M_EXTPG)) {
576 n->m_data = m->m_data + off;
577 mb_dupcl(n, m);
578 } else
579 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
580 (u_int)n->m_len);
581 if (len != M_COPYALL)
582 len -= n->m_len;
583 off = 0;
584 m = m->m_next;
585 np = &n->m_next;
586 }
587
588 return (top);
589 nospace:
590 m_freem(top);
591 return (NULL);
592 }
593
594 /*
595 * Copy an entire packet, including header (which must be present).
596 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
597 * Note that the copy is read-only, because clusters are not copied,
598 * only their reference counts are incremented.
599 * Preserve alignment of the first mbuf so if the creator has left
600 * some room at the beginning (e.g. for inserting protocol headers)
601 * the copies still have the room available.
602 */
603 struct mbuf *
m_copypacket(struct mbuf * m,int how)604 m_copypacket(struct mbuf *m, int how)
605 {
606 struct mbuf *top, *n, *o;
607
608 MBUF_CHECKSLEEP(how);
609 n = m_get(how, m->m_type);
610 top = n;
611 if (n == NULL)
612 goto nospace;
613
614 if (!m_dup_pkthdr(n, m, how))
615 goto nospace;
616 n->m_len = m->m_len;
617 if (m->m_flags & (M_EXT | M_EXTPG)) {
618 n->m_data = m->m_data;
619 mb_dupcl(n, m);
620 } else {
621 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
622 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
623 }
624
625 m = m->m_next;
626 while (m) {
627 o = m_get(how, m->m_type);
628 if (o == NULL)
629 goto nospace;
630
631 n->m_next = o;
632 n = n->m_next;
633
634 n->m_len = m->m_len;
635 if (m->m_flags & (M_EXT | M_EXTPG)) {
636 n->m_data = m->m_data;
637 mb_dupcl(n, m);
638 } else {
639 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
640 }
641
642 m = m->m_next;
643 }
644 return top;
645 nospace:
646 m_freem(top);
647 return (NULL);
648 }
649
650 static void
m_copyfromunmapped(const struct mbuf * m,int off,int len,caddr_t cp)651 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
652 {
653 struct iovec iov;
654 struct uio uio;
655 int error __diagused;
656
657 KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
658 KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
659 KASSERT(off < m->m_len,
660 ("m_copyfromunmapped: len exceeds mbuf length"));
661 iov.iov_base = cp;
662 iov.iov_len = len;
663 uio.uio_resid = len;
664 uio.uio_iov = &iov;
665 uio.uio_segflg = UIO_SYSSPACE;
666 uio.uio_iovcnt = 1;
667 uio.uio_offset = 0;
668 uio.uio_rw = UIO_READ;
669 error = m_unmapped_uiomove(m, off, &uio, len);
670 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
671 len));
672 }
673
674 /*
675 * Copy data from an mbuf chain starting "off" bytes from the beginning,
676 * continuing for "len" bytes, into the indicated buffer.
677 */
678 void
m_copydata(const struct mbuf * m,int off,int len,caddr_t cp)679 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
680 {
681 u_int count;
682
683 KASSERT(off >= 0, ("m_copydata, negative off %d", off));
684 KASSERT(len >= 0, ("m_copydata, negative len %d", len));
685 while (off > 0) {
686 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
687 if (off < m->m_len)
688 break;
689 off -= m->m_len;
690 m = m->m_next;
691 }
692 while (len > 0) {
693 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
694 count = min(m->m_len - off, len);
695 if ((m->m_flags & M_EXTPG) != 0)
696 m_copyfromunmapped(m, off, count, cp);
697 else
698 bcopy(mtod(m, caddr_t) + off, cp, count);
699 len -= count;
700 cp += count;
701 off = 0;
702 m = m->m_next;
703 }
704 }
705
706 /*
707 * Copy a packet header mbuf chain into a completely new chain, including
708 * copying any mbuf clusters. Use this instead of m_copypacket() when
709 * you need a writable copy of an mbuf chain.
710 */
711 struct mbuf *
m_dup(const struct mbuf * m,int how)712 m_dup(const struct mbuf *m, int how)
713 {
714 struct mbuf **p, *top = NULL;
715 int remain, moff, nsize;
716
717 MBUF_CHECKSLEEP(how);
718 /* Sanity check */
719 if (m == NULL)
720 return (NULL);
721 M_ASSERTPKTHDR(m);
722
723 /* While there's more data, get a new mbuf, tack it on, and fill it */
724 remain = m->m_pkthdr.len;
725 moff = 0;
726 p = ⊤
727 while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */
728 struct mbuf *n;
729
730 /* Get the next new mbuf */
731 if (remain >= MINCLSIZE) {
732 n = m_getcl(how, m->m_type, 0);
733 nsize = MCLBYTES;
734 } else {
735 n = m_get(how, m->m_type);
736 nsize = MLEN;
737 }
738 if (n == NULL)
739 goto nospace;
740
741 if (top == NULL) { /* First one, must be PKTHDR */
742 if (!m_dup_pkthdr(n, m, how)) {
743 m_free(n);
744 goto nospace;
745 }
746 if ((n->m_flags & M_EXT) == 0)
747 nsize = MHLEN;
748 n->m_flags &= ~M_RDONLY;
749 }
750 n->m_len = 0;
751
752 /* Link it into the new chain */
753 *p = n;
754 p = &n->m_next;
755
756 /* Copy data from original mbuf(s) into new mbuf */
757 while (n->m_len < nsize && m != NULL) {
758 int chunk = min(nsize - n->m_len, m->m_len - moff);
759
760 m_copydata(m, moff, chunk, n->m_data + n->m_len);
761 moff += chunk;
762 n->m_len += chunk;
763 remain -= chunk;
764 if (moff == m->m_len) {
765 m = m->m_next;
766 moff = 0;
767 }
768 }
769
770 /* Check correct total mbuf length */
771 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
772 ("%s: bogus m_pkthdr.len", __func__));
773 }
774 return (top);
775
776 nospace:
777 m_freem(top);
778 return (NULL);
779 }
780
781 /*
782 * Concatenate mbuf chain n to m.
783 * Both chains must be of the same type (e.g. MT_DATA).
784 * Any m_pkthdr is not updated.
785 */
786 void
m_cat(struct mbuf * m,struct mbuf * n)787 m_cat(struct mbuf *m, struct mbuf *n)
788 {
789 while (m->m_next)
790 m = m->m_next;
791 while (n) {
792 if (!M_WRITABLE(m) ||
793 (n->m_flags & M_EXTPG) != 0 ||
794 M_TRAILINGSPACE(m) < n->m_len) {
795 /* just join the two chains */
796 m->m_next = n;
797 return;
798 }
799 /* splat the data from one into the other */
800 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
801 (u_int)n->m_len);
802 m->m_len += n->m_len;
803 n = m_free(n);
804 }
805 }
806
807 /*
808 * Concatenate two pkthdr mbuf chains.
809 */
810 void
m_catpkt(struct mbuf * m,struct mbuf * n)811 m_catpkt(struct mbuf *m, struct mbuf *n)
812 {
813
814 M_ASSERTPKTHDR(m);
815 M_ASSERTPKTHDR(n);
816
817 m->m_pkthdr.len += n->m_pkthdr.len;
818 m_demote(n, 1, 0);
819
820 m_cat(m, n);
821 }
822
823 void
m_adj(struct mbuf * mp,int req_len)824 m_adj(struct mbuf *mp, int req_len)
825 {
826 int len = req_len;
827 struct mbuf *m;
828 int count;
829
830 if ((m = mp) == NULL)
831 return;
832 if (len >= 0) {
833 /*
834 * Trim from head.
835 */
836 while (m != NULL && len > 0) {
837 if (m->m_len <= len) {
838 len -= m->m_len;
839 m->m_len = 0;
840 m = m->m_next;
841 } else {
842 m->m_len -= len;
843 m->m_data += len;
844 len = 0;
845 }
846 }
847 if (mp->m_flags & M_PKTHDR)
848 mp->m_pkthdr.len -= (req_len - len);
849 } else {
850 /*
851 * Trim from tail. Scan the mbuf chain,
852 * calculating its length and finding the last mbuf.
853 * If the adjustment only affects this mbuf, then just
854 * adjust and return. Otherwise, rescan and truncate
855 * after the remaining size.
856 */
857 len = -len;
858 count = 0;
859 for (;;) {
860 count += m->m_len;
861 if (m->m_next == (struct mbuf *)0)
862 break;
863 m = m->m_next;
864 }
865 if (m->m_len >= len) {
866 m->m_len -= len;
867 if (mp->m_flags & M_PKTHDR)
868 mp->m_pkthdr.len -= len;
869 return;
870 }
871 count -= len;
872 if (count < 0)
873 count = 0;
874 /*
875 * Correct length for chain is "count".
876 * Find the mbuf with last data, adjust its length,
877 * and toss data from remaining mbufs on chain.
878 */
879 m = mp;
880 if (m->m_flags & M_PKTHDR)
881 m->m_pkthdr.len = count;
882 for (; m; m = m->m_next) {
883 if (m->m_len >= count) {
884 m->m_len = count;
885 if (m->m_next != NULL) {
886 m_freem(m->m_next);
887 m->m_next = NULL;
888 }
889 break;
890 }
891 count -= m->m_len;
892 }
893 }
894 }
895
896 void
m_adj_decap(struct mbuf * mp,int len)897 m_adj_decap(struct mbuf *mp, int len)
898 {
899 uint8_t rsstype;
900
901 m_adj(mp, len);
902 if ((mp->m_flags & M_PKTHDR) != 0) {
903 /*
904 * If flowid was calculated by card from the inner
905 * headers, move flowid to the decapsulated mbuf
906 * chain, otherwise clear. This depends on the
907 * internals of m_adj, which keeps pkthdr as is, in
908 * particular not changing rsstype and flowid.
909 */
910 rsstype = mp->m_pkthdr.rsstype;
911 if ((rsstype & M_HASHTYPE_INNER) != 0) {
912 M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER);
913 } else {
914 M_HASHTYPE_CLEAR(mp);
915 }
916 }
917 }
918
919 /*
920 * Rearange an mbuf chain so that len bytes are contiguous
921 * and in the data area of an mbuf (so that mtod will work
922 * for a structure of size len). Returns the resulting
923 * mbuf chain on success, frees it and returns null on failure.
924 * If there is room, it will add up to max_protohdr-len extra bytes to the
925 * contiguous region in an attempt to avoid being called next time.
926 */
927 struct mbuf *
m_pullup(struct mbuf * n,int len)928 m_pullup(struct mbuf *n, int len)
929 {
930 struct mbuf *m;
931 int count;
932 int space;
933
934 KASSERT((n->m_flags & M_EXTPG) == 0,
935 ("%s: unmapped mbuf %p", __func__, n));
936
937 /*
938 * If first mbuf has no cluster, and has room for len bytes
939 * without shifting current data, pullup into it,
940 * otherwise allocate a new mbuf to prepend to the chain.
941 */
942 if ((n->m_flags & M_EXT) == 0 &&
943 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
944 if (n->m_len >= len)
945 return (n);
946 m = n;
947 n = n->m_next;
948 len -= m->m_len;
949 } else {
950 if (len > MHLEN)
951 goto bad;
952 m = m_get(M_NOWAIT, n->m_type);
953 if (m == NULL)
954 goto bad;
955 if (n->m_flags & M_PKTHDR)
956 m_move_pkthdr(m, n);
957 }
958 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
959 do {
960 count = min(min(max(len, max_protohdr), space), n->m_len);
961 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
962 (u_int)count);
963 len -= count;
964 m->m_len += count;
965 n->m_len -= count;
966 space -= count;
967 if (n->m_len)
968 n->m_data += count;
969 else
970 n = m_free(n);
971 } while (len > 0 && n);
972 if (len > 0) {
973 (void) m_free(m);
974 goto bad;
975 }
976 m->m_next = n;
977 return (m);
978 bad:
979 m_freem(n);
980 return (NULL);
981 }
982
983 /*
984 * Like m_pullup(), except a new mbuf is always allocated, and we allow
985 * the amount of empty space before the data in the new mbuf to be specified
986 * (in the event that the caller expects to prepend later).
987 */
988 struct mbuf *
m_copyup(struct mbuf * n,int len,int dstoff)989 m_copyup(struct mbuf *n, int len, int dstoff)
990 {
991 struct mbuf *m;
992 int count, space;
993
994 if (len > (MHLEN - dstoff))
995 goto bad;
996 m = m_get(M_NOWAIT, n->m_type);
997 if (m == NULL)
998 goto bad;
999 if (n->m_flags & M_PKTHDR)
1000 m_move_pkthdr(m, n);
1001 m->m_data += dstoff;
1002 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1003 do {
1004 count = min(min(max(len, max_protohdr), space), n->m_len);
1005 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
1006 (unsigned)count);
1007 len -= count;
1008 m->m_len += count;
1009 n->m_len -= count;
1010 space -= count;
1011 if (n->m_len)
1012 n->m_data += count;
1013 else
1014 n = m_free(n);
1015 } while (len > 0 && n);
1016 if (len > 0) {
1017 (void) m_free(m);
1018 goto bad;
1019 }
1020 m->m_next = n;
1021 return (m);
1022 bad:
1023 m_freem(n);
1024 return (NULL);
1025 }
1026
1027 /*
1028 * Partition an mbuf chain in two pieces, returning the tail --
1029 * all but the first len0 bytes. In case of failure, it returns NULL and
1030 * attempts to restore the chain to its original state.
1031 *
1032 * Note that the resulting mbufs might be read-only, because the new
1033 * mbuf can end up sharing an mbuf cluster with the original mbuf if
1034 * the "breaking point" happens to lie within a cluster mbuf. Use the
1035 * M_WRITABLE() macro to check for this case.
1036 */
1037 struct mbuf *
m_split(struct mbuf * m0,int len0,int wait)1038 m_split(struct mbuf *m0, int len0, int wait)
1039 {
1040 struct mbuf *m, *n;
1041 u_int len = len0, remain;
1042
1043 MBUF_CHECKSLEEP(wait);
1044 for (m = m0; m && len > m->m_len; m = m->m_next)
1045 len -= m->m_len;
1046 if (m == NULL)
1047 return (NULL);
1048 remain = m->m_len - len;
1049 if (m0->m_flags & M_PKTHDR && remain == 0) {
1050 n = m_gethdr(wait, m0->m_type);
1051 if (n == NULL)
1052 return (NULL);
1053 n->m_next = m->m_next;
1054 m->m_next = NULL;
1055 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
1056 n->m_pkthdr.snd_tag =
1057 m_snd_tag_ref(m0->m_pkthdr.snd_tag);
1058 n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
1059 } else
1060 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1061 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1062 m0->m_pkthdr.len = len0;
1063 return (n);
1064 } else if (m0->m_flags & M_PKTHDR) {
1065 n = m_gethdr(wait, m0->m_type);
1066 if (n == NULL)
1067 return (NULL);
1068 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
1069 n->m_pkthdr.snd_tag =
1070 m_snd_tag_ref(m0->m_pkthdr.snd_tag);
1071 n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
1072 } else
1073 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1074 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1075 m0->m_pkthdr.len = len0;
1076 if (m->m_flags & (M_EXT | M_EXTPG))
1077 goto extpacket;
1078 if (remain > MHLEN) {
1079 /* m can't be the lead packet */
1080 M_ALIGN(n, 0);
1081 n->m_next = m_split(m, len, wait);
1082 if (n->m_next == NULL) {
1083 (void) m_free(n);
1084 return (NULL);
1085 } else {
1086 n->m_len = 0;
1087 return (n);
1088 }
1089 } else
1090 M_ALIGN(n, remain);
1091 } else if (remain == 0) {
1092 n = m->m_next;
1093 m->m_next = NULL;
1094 return (n);
1095 } else {
1096 n = m_get(wait, m->m_type);
1097 if (n == NULL)
1098 return (NULL);
1099 M_ALIGN(n, remain);
1100 }
1101 extpacket:
1102 if (m->m_flags & (M_EXT | M_EXTPG)) {
1103 n->m_data = m->m_data + len;
1104 mb_dupcl(n, m);
1105 } else {
1106 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1107 }
1108 n->m_len = remain;
1109 m->m_len = len;
1110 n->m_next = m->m_next;
1111 m->m_next = NULL;
1112 return (n);
1113 }
1114
1115 /*
1116 * Partition mchain in two pieces, keeping len0 bytes in head and transferring
1117 * remainder to tail. In case of failure, both chains to be left untouched.
1118 * M_EOR is observed correctly.
1119 * Resulting mbufs might be read-only.
1120 */
1121 int
mc_split(struct mchain * head,struct mchain * tail,u_int len0,int wait)1122 mc_split(struct mchain *head, struct mchain *tail, u_int len0, int wait)
1123 {
1124 struct mbuf *m, *n;
1125 u_int len, mlen, remain;
1126
1127 MPASS(!(mc_first(head)->m_flags & M_PKTHDR));
1128 MBUF_CHECKSLEEP(wait);
1129
1130 mlen = 0;
1131 len = len0;
1132 STAILQ_FOREACH(m, &head->mc_q, m_stailq) {
1133 mlen += MSIZE;
1134 if (m->m_flags & M_EXT)
1135 mlen += m->m_ext.ext_size;
1136 if (len > m->m_len)
1137 len -= m->m_len;
1138 else
1139 break;
1140 }
1141 if (__predict_false(m == NULL)) {
1142 *tail = MCHAIN_INITIALIZER(tail);
1143 return (0);
1144 }
1145 remain = m->m_len - len;
1146 if (remain > 0) {
1147 if (__predict_false((n = m_get(wait, m->m_type)) == NULL))
1148 return (ENOMEM);
1149 m_align(n, remain);
1150 if (m->m_flags & M_EXT) {
1151 n->m_data = m->m_data + len;
1152 mb_dupcl(n, m);
1153 } else
1154 bcopy(mtod(m, char *) + len, mtod(n, char *), remain);
1155 }
1156
1157 /* XXXGL: need STAILQ_SPLIT */
1158 STAILQ_FIRST(&tail->mc_q) = STAILQ_NEXT(m, m_stailq);
1159 tail->mc_q.stqh_last = head->mc_q.stqh_last;
1160 tail->mc_len = head->mc_len - len0;
1161 tail->mc_mlen = head->mc_mlen - mlen;
1162 if (remain > 0) {
1163 MPASS(n->m_len == 0);
1164 mc_prepend(tail, n);
1165 n->m_len = remain;
1166 m->m_len -= remain;
1167 if (m->m_flags & M_EOR) {
1168 m->m_flags &= ~M_EOR;
1169 n->m_flags |= M_EOR;
1170 }
1171 }
1172 head->mc_q.stqh_last = &STAILQ_NEXT(m, m_stailq);
1173 STAILQ_NEXT(m, m_stailq) = NULL;
1174 head->mc_len = len0;
1175 head->mc_mlen = mlen;
1176
1177 return (0);
1178 }
1179
1180 /*
1181 * Routine to copy from device local memory into mbufs.
1182 * Note that `off' argument is offset into first mbuf of target chain from
1183 * which to begin copying the data to.
1184 */
1185 struct mbuf *
m_devget(char * buf,int totlen,int off,struct ifnet * ifp,void (* copy)(char * from,caddr_t to,u_int len))1186 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
1187 void (*copy)(char *from, caddr_t to, u_int len))
1188 {
1189 struct mbuf *m;
1190 struct mbuf *top = NULL, **mp = ⊤
1191 int len;
1192
1193 if (off < 0 || off > MHLEN)
1194 return (NULL);
1195
1196 while (totlen > 0) {
1197 if (top == NULL) { /* First one, must be PKTHDR */
1198 if (totlen + off >= MINCLSIZE) {
1199 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1200 len = MCLBYTES;
1201 } else {
1202 m = m_gethdr(M_NOWAIT, MT_DATA);
1203 len = MHLEN;
1204
1205 /* Place initial small packet/header at end of mbuf */
1206 if (m && totlen + off + max_linkhdr <= MHLEN) {
1207 m->m_data += max_linkhdr;
1208 len -= max_linkhdr;
1209 }
1210 }
1211 if (m == NULL)
1212 return NULL;
1213 m->m_pkthdr.rcvif = ifp;
1214 m->m_pkthdr.len = totlen;
1215 } else {
1216 if (totlen + off >= MINCLSIZE) {
1217 m = m_getcl(M_NOWAIT, MT_DATA, 0);
1218 len = MCLBYTES;
1219 } else {
1220 m = m_get(M_NOWAIT, MT_DATA);
1221 len = MLEN;
1222 }
1223 if (m == NULL) {
1224 m_freem(top);
1225 return NULL;
1226 }
1227 }
1228 if (off) {
1229 m->m_data += off;
1230 len -= off;
1231 off = 0;
1232 }
1233 m->m_len = len = min(totlen, len);
1234 if (copy)
1235 copy(buf, mtod(m, caddr_t), (u_int)len);
1236 else
1237 bcopy(buf, mtod(m, caddr_t), (u_int)len);
1238 buf += len;
1239 *mp = m;
1240 mp = &m->m_next;
1241 totlen -= len;
1242 }
1243 return (top);
1244 }
1245
1246 static void
m_copytounmapped(const struct mbuf * m,int off,int len,c_caddr_t cp)1247 m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp)
1248 {
1249 struct iovec iov;
1250 struct uio uio;
1251 int error __diagused;
1252
1253 KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off));
1254 KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len));
1255 KASSERT(off < m->m_len, ("m_copytounmapped: len exceeds mbuf length"));
1256 iov.iov_base = __DECONST(caddr_t, cp);
1257 iov.iov_len = len;
1258 uio.uio_resid = len;
1259 uio.uio_iov = &iov;
1260 uio.uio_segflg = UIO_SYSSPACE;
1261 uio.uio_iovcnt = 1;
1262 uio.uio_offset = 0;
1263 uio.uio_rw = UIO_WRITE;
1264 error = m_unmapped_uiomove(m, off, &uio, len);
1265 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
1266 len));
1267 }
1268
1269 /*
1270 * Copy data from a buffer back into the indicated mbuf chain,
1271 * starting "off" bytes from the beginning, extending the mbuf
1272 * chain if necessary.
1273 */
1274 void
m_copyback(struct mbuf * m0,int off,int len,c_caddr_t cp)1275 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
1276 {
1277 int mlen;
1278 struct mbuf *m = m0, *n;
1279 int totlen = 0;
1280
1281 if (m0 == NULL)
1282 return;
1283 while (off > (mlen = m->m_len)) {
1284 off -= mlen;
1285 totlen += mlen;
1286 if (m->m_next == NULL) {
1287 n = m_get(M_NOWAIT, m->m_type);
1288 if (n == NULL)
1289 goto out;
1290 bzero(mtod(n, caddr_t), MLEN);
1291 n->m_len = min(MLEN, len + off);
1292 m->m_next = n;
1293 }
1294 m = m->m_next;
1295 }
1296 while (len > 0) {
1297 if (m->m_next == NULL && (len > m->m_len - off)) {
1298 m->m_len += min(len - (m->m_len - off),
1299 M_TRAILINGSPACE(m));
1300 }
1301 mlen = min (m->m_len - off, len);
1302 if ((m->m_flags & M_EXTPG) != 0)
1303 m_copytounmapped(m, off, mlen, cp);
1304 else
1305 bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
1306 cp += mlen;
1307 len -= mlen;
1308 mlen += off;
1309 off = 0;
1310 totlen += mlen;
1311 if (len == 0)
1312 break;
1313 if (m->m_next == NULL) {
1314 n = m_get(M_NOWAIT, m->m_type);
1315 if (n == NULL)
1316 break;
1317 n->m_len = min(MLEN, len);
1318 m->m_next = n;
1319 }
1320 m = m->m_next;
1321 }
1322 out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1323 m->m_pkthdr.len = totlen;
1324 }
1325
1326 /*
1327 * Append the specified data to the indicated mbuf chain,
1328 * Extend the mbuf chain if the new data does not fit in
1329 * existing space.
1330 *
1331 * Return 1 if able to complete the job; otherwise 0.
1332 */
1333 int
m_append(struct mbuf * m0,int len,c_caddr_t cp)1334 m_append(struct mbuf *m0, int len, c_caddr_t cp)
1335 {
1336 struct mbuf *m, *n;
1337 int remainder, space;
1338
1339 for (m = m0; m->m_next != NULL; m = m->m_next)
1340 ;
1341 remainder = len;
1342 space = M_TRAILINGSPACE(m);
1343 if (space > 0) {
1344 /*
1345 * Copy into available space.
1346 */
1347 if (space > remainder)
1348 space = remainder;
1349 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1350 m->m_len += space;
1351 cp += space, remainder -= space;
1352 }
1353 while (remainder > 0) {
1354 /*
1355 * Allocate a new mbuf; could check space
1356 * and allocate a cluster instead.
1357 */
1358 n = m_get(M_NOWAIT, m->m_type);
1359 if (n == NULL)
1360 break;
1361 n->m_len = min(MLEN, remainder);
1362 bcopy(cp, mtod(n, caddr_t), n->m_len);
1363 cp += n->m_len, remainder -= n->m_len;
1364 m->m_next = n;
1365 m = n;
1366 }
1367 if (m0->m_flags & M_PKTHDR)
1368 m0->m_pkthdr.len += len - remainder;
1369 return (remainder == 0);
1370 }
1371
1372 static int
m_apply_extpg_one(struct mbuf * m,int off,int len,int (* f)(void *,void *,u_int),void * arg)1373 m_apply_extpg_one(struct mbuf *m, int off, int len,
1374 int (*f)(void *, void *, u_int), void *arg)
1375 {
1376 void *p;
1377 u_int i, count, pgoff, pglen;
1378 int rval;
1379
1380 KASSERT(PMAP_HAS_DMAP,
1381 ("m_apply_extpg_one does not support unmapped mbufs"));
1382 off += mtod(m, vm_offset_t);
1383 if (off < m->m_epg_hdrlen) {
1384 count = min(m->m_epg_hdrlen - off, len);
1385 rval = f(arg, m->m_epg_hdr + off, count);
1386 if (rval)
1387 return (rval);
1388 len -= count;
1389 off = 0;
1390 } else
1391 off -= m->m_epg_hdrlen;
1392 pgoff = m->m_epg_1st_off;
1393 for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
1394 pglen = m_epg_pagelen(m, i, pgoff);
1395 if (off < pglen) {
1396 count = min(pglen - off, len);
1397 p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off);
1398 rval = f(arg, p, count);
1399 if (rval)
1400 return (rval);
1401 len -= count;
1402 off = 0;
1403 } else
1404 off -= pglen;
1405 pgoff = 0;
1406 }
1407 if (len > 0) {
1408 KASSERT(off < m->m_epg_trllen,
1409 ("m_apply_extpg_one: offset beyond trailer"));
1410 KASSERT(len <= m->m_epg_trllen - off,
1411 ("m_apply_extpg_one: length beyond trailer"));
1412 return (f(arg, m->m_epg_trail + off, len));
1413 }
1414 return (0);
1415 }
1416
1417 /* Apply function f to the data in a single mbuf. */
1418 static int
m_apply_one(struct mbuf * m,int off,int len,int (* f)(void *,void *,u_int),void * arg)1419 m_apply_one(struct mbuf *m, int off, int len,
1420 int (*f)(void *, void *, u_int), void *arg)
1421 {
1422 if ((m->m_flags & M_EXTPG) != 0)
1423 return (m_apply_extpg_one(m, off, len, f, arg));
1424 else
1425 return (f(arg, mtod(m, caddr_t) + off, len));
1426 }
1427
1428 /*
1429 * Apply function f to the data in an mbuf chain starting "off" bytes from
1430 * the beginning, continuing for "len" bytes.
1431 */
1432 int
m_apply(struct mbuf * m,int off,int len,int (* f)(void *,void *,u_int),void * arg)1433 m_apply(struct mbuf *m, int off, int len,
1434 int (*f)(void *, void *, u_int), void *arg)
1435 {
1436 u_int count;
1437 int rval;
1438
1439 KASSERT(off >= 0, ("m_apply, negative off %d", off));
1440 KASSERT(len >= 0, ("m_apply, negative len %d", len));
1441 while (off > 0) {
1442 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1443 if (off < m->m_len)
1444 break;
1445 off -= m->m_len;
1446 m = m->m_next;
1447 }
1448 while (len > 0) {
1449 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1450 count = min(m->m_len - off, len);
1451 rval = m_apply_one(m, off, count, f, arg);
1452 if (rval)
1453 return (rval);
1454 len -= count;
1455 off = 0;
1456 m = m->m_next;
1457 }
1458 return (0);
1459 }
1460
1461 /*
1462 * Return a pointer to mbuf/offset of location in mbuf chain.
1463 */
1464 struct mbuf *
m_getptr(struct mbuf * m,int loc,int * off)1465 m_getptr(struct mbuf *m, int loc, int *off)
1466 {
1467
1468 while (loc >= 0) {
1469 /* Normal end of search. */
1470 if (m->m_len > loc) {
1471 *off = loc;
1472 return (m);
1473 } else {
1474 loc -= m->m_len;
1475 if (m->m_next == NULL) {
1476 if (loc == 0) {
1477 /* Point at the end of valid data. */
1478 *off = m->m_len;
1479 return (m);
1480 }
1481 return (NULL);
1482 }
1483 m = m->m_next;
1484 }
1485 }
1486 return (NULL);
1487 }
1488
1489 void
m_print(const struct mbuf * m,int maxlen)1490 m_print(const struct mbuf *m, int maxlen)
1491 {
1492 int len;
1493 int pdata;
1494 const struct mbuf *m2;
1495
1496 if (m == NULL) {
1497 printf("mbuf: %p\n", m);
1498 return;
1499 }
1500
1501 if (m->m_flags & M_PKTHDR)
1502 len = m->m_pkthdr.len;
1503 else
1504 len = -1;
1505 m2 = m;
1506 while (m2 != NULL && (len == -1 || len)) {
1507 pdata = m2->m_len;
1508 if (maxlen != -1 && pdata > maxlen)
1509 pdata = maxlen;
1510 printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
1511 m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
1512 "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
1513 "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
1514 if (pdata)
1515 printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
1516 if (len != -1)
1517 len -= m2->m_len;
1518 m2 = m2->m_next;
1519 }
1520 if (len > 0)
1521 printf("%d bytes unaccounted for.\n", len);
1522 return;
1523 }
1524
1525 u_int
m_fixhdr(struct mbuf * m0)1526 m_fixhdr(struct mbuf *m0)
1527 {
1528 u_int len;
1529
1530 len = m_length(m0, NULL);
1531 m0->m_pkthdr.len = len;
1532 return (len);
1533 }
1534
1535 u_int
m_length(struct mbuf * m0,struct mbuf ** last)1536 m_length(struct mbuf *m0, struct mbuf **last)
1537 {
1538 struct mbuf *m;
1539 u_int len;
1540
1541 len = 0;
1542 for (m = m0; m != NULL; m = m->m_next) {
1543 len += m->m_len;
1544 if (m->m_next == NULL)
1545 break;
1546 }
1547 if (last != NULL)
1548 *last = m;
1549 return (len);
1550 }
1551
1552 /*
1553 * Defragment a mbuf chain, returning the shortest possible
1554 * chain of mbufs and clusters. If allocation fails and
1555 * this cannot be completed, NULL will be returned, but
1556 * the passed in chain will be unchanged. Upon success,
1557 * the original chain will be freed, and the new chain
1558 * will be returned.
1559 *
1560 * If a non-packet header is passed in, the original
1561 * mbuf (chain?) will be returned unharmed.
1562 */
1563 struct mbuf *
m_defrag(struct mbuf * m0,int how)1564 m_defrag(struct mbuf *m0, int how)
1565 {
1566 struct mbuf *m_new = NULL, *m_final = NULL;
1567 int progress = 0, length;
1568
1569 MBUF_CHECKSLEEP(how);
1570 if (!(m0->m_flags & M_PKTHDR))
1571 return (m0);
1572
1573 m_fixhdr(m0); /* Needed sanity check */
1574
1575 #ifdef MBUF_STRESS_TEST
1576 if (m_defragrandomfailures) {
1577 int temp = arc4random() & 0xff;
1578 if (temp == 0xba)
1579 goto nospace;
1580 }
1581 #endif
1582
1583 if (m0->m_pkthdr.len > MHLEN)
1584 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1585 else
1586 m_final = m_gethdr(how, MT_DATA);
1587
1588 if (m_final == NULL)
1589 goto nospace;
1590
1591 if (m_dup_pkthdr(m_final, m0, how) == 0)
1592 goto nospace;
1593
1594 m_new = m_final;
1595
1596 while (progress < m0->m_pkthdr.len) {
1597 length = m0->m_pkthdr.len - progress;
1598 if (length > MCLBYTES)
1599 length = MCLBYTES;
1600
1601 if (m_new == NULL) {
1602 if (length > MLEN)
1603 m_new = m_getcl(how, MT_DATA, 0);
1604 else
1605 m_new = m_get(how, MT_DATA);
1606 if (m_new == NULL)
1607 goto nospace;
1608 }
1609
1610 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1611 progress += length;
1612 m_new->m_len = length;
1613 if (m_new != m_final)
1614 m_cat(m_final, m_new);
1615 m_new = NULL;
1616 }
1617 #ifdef MBUF_STRESS_TEST
1618 if (m0->m_next == NULL)
1619 m_defraguseless++;
1620 #endif
1621 m_freem(m0);
1622 m0 = m_final;
1623 #ifdef MBUF_STRESS_TEST
1624 m_defragpackets++;
1625 m_defragbytes += m0->m_pkthdr.len;
1626 #endif
1627 return (m0);
1628 nospace:
1629 #ifdef MBUF_STRESS_TEST
1630 m_defragfailure++;
1631 #endif
1632 if (m_final)
1633 m_freem(m_final);
1634 return (NULL);
1635 }
1636
1637 /*
1638 * Return the number of fragments an mbuf will use. This is usually
1639 * used as a proxy for the number of scatter/gather elements needed by
1640 * a DMA engine to access an mbuf. In general mapped mbufs are
1641 * assumed to be backed by physically contiguous buffers that only
1642 * need a single fragment. Unmapped mbufs, on the other hand, can
1643 * span disjoint physical pages.
1644 */
1645 static int
frags_per_mbuf(struct mbuf * m)1646 frags_per_mbuf(struct mbuf *m)
1647 {
1648 int frags;
1649
1650 if ((m->m_flags & M_EXTPG) == 0)
1651 return (1);
1652
1653 /*
1654 * The header and trailer are counted as a single fragment
1655 * each when present.
1656 *
1657 * XXX: This overestimates the number of fragments by assuming
1658 * all the backing physical pages are disjoint.
1659 */
1660 frags = 0;
1661 if (m->m_epg_hdrlen != 0)
1662 frags++;
1663 frags += m->m_epg_npgs;
1664 if (m->m_epg_trllen != 0)
1665 frags++;
1666
1667 return (frags);
1668 }
1669
1670 /*
1671 * Defragment an mbuf chain, returning at most maxfrags separate
1672 * mbufs+clusters. If this is not possible NULL is returned and
1673 * the original mbuf chain is left in its present (potentially
1674 * modified) state. We use two techniques: collapsing consecutive
1675 * mbufs and replacing consecutive mbufs by a cluster.
1676 *
1677 * NB: this should really be named m_defrag but that name is taken
1678 */
1679 struct mbuf *
m_collapse(struct mbuf * m0,int how,int maxfrags)1680 m_collapse(struct mbuf *m0, int how, int maxfrags)
1681 {
1682 struct mbuf *m, *n, *n2, **prev;
1683 u_int curfrags;
1684
1685 /*
1686 * Calculate the current number of frags.
1687 */
1688 curfrags = 0;
1689 for (m = m0; m != NULL; m = m->m_next)
1690 curfrags += frags_per_mbuf(m);
1691 /*
1692 * First, try to collapse mbufs. Note that we always collapse
1693 * towards the front so we don't need to deal with moving the
1694 * pkthdr. This may be suboptimal if the first mbuf has much
1695 * less data than the following.
1696 */
1697 m = m0;
1698 again:
1699 for (;;) {
1700 n = m->m_next;
1701 if (n == NULL)
1702 break;
1703 if (M_WRITABLE(m) &&
1704 n->m_len < M_TRAILINGSPACE(m)) {
1705 m_copydata(n, 0, n->m_len,
1706 mtod(m, char *) + m->m_len);
1707 m->m_len += n->m_len;
1708 m->m_next = n->m_next;
1709 curfrags -= frags_per_mbuf(n);
1710 m_free(n);
1711 if (curfrags <= maxfrags)
1712 return m0;
1713 } else
1714 m = n;
1715 }
1716 KASSERT(maxfrags > 1,
1717 ("maxfrags %u, but normal collapse failed", maxfrags));
1718 /*
1719 * Collapse consecutive mbufs to a cluster.
1720 */
1721 prev = &m0->m_next; /* NB: not the first mbuf */
1722 while ((n = *prev) != NULL) {
1723 if ((n2 = n->m_next) != NULL &&
1724 n->m_len + n2->m_len < MCLBYTES) {
1725 m = m_getcl(how, MT_DATA, 0);
1726 if (m == NULL)
1727 goto bad;
1728 m_copydata(n, 0, n->m_len, mtod(m, char *));
1729 m_copydata(n2, 0, n2->m_len,
1730 mtod(m, char *) + n->m_len);
1731 m->m_len = n->m_len + n2->m_len;
1732 m->m_next = n2->m_next;
1733 *prev = m;
1734 curfrags += 1; /* For the new cluster */
1735 curfrags -= frags_per_mbuf(n);
1736 curfrags -= frags_per_mbuf(n2);
1737 m_free(n);
1738 m_free(n2);
1739 if (curfrags <= maxfrags)
1740 return m0;
1741 /*
1742 * Still not there, try the normal collapse
1743 * again before we allocate another cluster.
1744 */
1745 goto again;
1746 }
1747 prev = &n->m_next;
1748 }
1749 /*
1750 * No place where we can collapse to a cluster; punt.
1751 * This can occur if, for example, you request 2 frags
1752 * but the packet requires that both be clusters (we
1753 * never reallocate the first mbuf to avoid moving the
1754 * packet header).
1755 */
1756 bad:
1757 return NULL;
1758 }
1759
1760 #ifdef MBUF_STRESS_TEST
1761
1762 /*
1763 * Fragment an mbuf chain. There's no reason you'd ever want to do
1764 * this in normal usage, but it's great for stress testing various
1765 * mbuf consumers.
1766 *
1767 * If fragmentation is not possible, the original chain will be
1768 * returned.
1769 *
1770 * Possible length values:
1771 * 0 no fragmentation will occur
1772 * > 0 each fragment will be of the specified length
1773 * -1 each fragment will be the same random value in length
1774 * -2 each fragment's length will be entirely random
1775 * (Random values range from 1 to 256)
1776 */
1777 struct mbuf *
m_fragment(struct mbuf * m0,int how,int length)1778 m_fragment(struct mbuf *m0, int how, int length)
1779 {
1780 struct mbuf *m_first, *m_last;
1781 int divisor = 255, progress = 0, fraglen;
1782
1783 if (!(m0->m_flags & M_PKTHDR))
1784 return (m0);
1785
1786 if (length == 0 || length < -2)
1787 return (m0);
1788 if (length > MCLBYTES)
1789 length = MCLBYTES;
1790 if (length < 0 && divisor > MCLBYTES)
1791 divisor = MCLBYTES;
1792 if (length == -1)
1793 length = 1 + (arc4random() % divisor);
1794 if (length > 0)
1795 fraglen = length;
1796
1797 m_fixhdr(m0); /* Needed sanity check */
1798
1799 m_first = m_getcl(how, MT_DATA, M_PKTHDR);
1800 if (m_first == NULL)
1801 goto nospace;
1802
1803 if (m_dup_pkthdr(m_first, m0, how) == 0)
1804 goto nospace;
1805
1806 m_last = m_first;
1807
1808 while (progress < m0->m_pkthdr.len) {
1809 if (length == -2)
1810 fraglen = 1 + (arc4random() % divisor);
1811 if (fraglen > m0->m_pkthdr.len - progress)
1812 fraglen = m0->m_pkthdr.len - progress;
1813
1814 if (progress != 0) {
1815 struct mbuf *m_new = m_getcl(how, MT_DATA, 0);
1816 if (m_new == NULL)
1817 goto nospace;
1818
1819 m_last->m_next = m_new;
1820 m_last = m_new;
1821 }
1822
1823 m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t));
1824 progress += fraglen;
1825 m_last->m_len = fraglen;
1826 }
1827 m_freem(m0);
1828 m0 = m_first;
1829 return (m0);
1830 nospace:
1831 if (m_first)
1832 m_freem(m_first);
1833 /* Return the original chain on failure */
1834 return (m0);
1835 }
1836
1837 #endif
1838
1839 /*
1840 * Free pages from mbuf_ext_pgs, assuming they were allocated via
1841 * vm_page_alloc() and aren't associated with any object. Complement
1842 * to allocator from m_uiotombuf_nomap().
1843 */
1844 void
mb_free_mext_pgs(struct mbuf * m)1845 mb_free_mext_pgs(struct mbuf *m)
1846 {
1847 vm_page_t pg;
1848
1849 M_ASSERTEXTPG(m);
1850 for (int i = 0; i < m->m_epg_npgs; i++) {
1851 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
1852 vm_page_unwire_noq(pg);
1853 vm_page_free(pg);
1854 }
1855 }
1856
1857 static struct mbuf *
m_uiotombuf_nomap(struct uio * uio,int how,int len,int maxseg,int flags)1858 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
1859 {
1860 struct mbuf *m, *mb, *prev;
1861 vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
1862 int error, length, i, needed;
1863 ssize_t total;
1864 int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED;
1865
1866 MPASS((flags & M_PKTHDR) == 0);
1867 MPASS((how & M_ZERO) == 0);
1868
1869 /*
1870 * len can be zero or an arbitrary large value bound by
1871 * the total data supplied by the uio.
1872 */
1873 if (len > 0)
1874 total = MIN(uio->uio_resid, len);
1875 else
1876 total = uio->uio_resid;
1877
1878 if (maxseg == 0)
1879 maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
1880
1881 /*
1882 * If total is zero, return an empty mbuf. This can occur
1883 * for TLS 1.0 connections which send empty fragments as
1884 * a countermeasure against the known-IV weakness in CBC
1885 * ciphersuites.
1886 */
1887 if (__predict_false(total == 0)) {
1888 mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
1889 if (mb == NULL)
1890 return (NULL);
1891 mb->m_epg_flags = EPG_FLAG_ANON;
1892 return (mb);
1893 }
1894
1895 /*
1896 * Allocate the pages
1897 */
1898 m = NULL;
1899 while (total > 0) {
1900 mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
1901 if (mb == NULL)
1902 goto failed;
1903 if (m == NULL)
1904 m = mb;
1905 else
1906 prev->m_next = mb;
1907 prev = mb;
1908 mb->m_epg_flags = EPG_FLAG_ANON;
1909 needed = length = MIN(maxseg, total);
1910 for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
1911 retry_page:
1912 pg_array[i] = vm_page_alloc_noobj(pflags);
1913 if (pg_array[i] == NULL) {
1914 if (how & M_NOWAIT) {
1915 goto failed;
1916 } else {
1917 vm_wait(NULL);
1918 goto retry_page;
1919 }
1920 }
1921 mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
1922 mb->m_epg_npgs++;
1923 }
1924 mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1);
1925 MBUF_EXT_PGS_ASSERT_SANITY(mb);
1926 total -= length;
1927 error = uiomove_fromphys(pg_array, 0, length, uio);
1928 if (error != 0)
1929 goto failed;
1930 mb->m_len = length;
1931 mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs;
1932 if (flags & M_PKTHDR)
1933 m->m_pkthdr.len += length;
1934 }
1935 return (m);
1936
1937 failed:
1938 m_freem(m);
1939 return (NULL);
1940 }
1941
1942 /*
1943 * Copy the contents of uio into a properly sized mbuf chain.
1944 * A compat KPI. Users are recommended to use direct calls to backing
1945 * functions.
1946 */
1947 struct mbuf *
m_uiotombuf(struct uio * uio,int how,int len,int lspace,int flags)1948 m_uiotombuf(struct uio *uio, int how, int len, int lspace, int flags)
1949 {
1950
1951 if (flags & M_EXTPG) {
1952 /* XXX: 'lspace' magically becomes maxseg! */
1953 return (m_uiotombuf_nomap(uio, how, len, lspace, flags));
1954 } else if (__predict_false(uio->uio_resid == 0)) {
1955 struct mbuf *m;
1956
1957 /*
1958 * m_uiotombuf() is known to return zero length buffer, keep
1959 * this compatibility. mc_uiotomc() won't do that.
1960 */
1961 if (flags & M_PKTHDR) {
1962 m = m_gethdr(how, MT_DATA);
1963 m->m_pkthdr.memlen = MSIZE;
1964 } else
1965 m = m_get(how, MT_DATA);
1966 if (m != NULL)
1967 m->m_data += lspace;
1968 return (m);
1969 } else {
1970 struct mchain mc;
1971 int error;
1972
1973 error = mc_uiotomc(&mc, uio, len, lspace, how, flags);
1974 if (__predict_true(error == 0)) {
1975 if (flags & M_PKTHDR) {
1976 mc_first(&mc)->m_pkthdr.len = mc.mc_len;
1977 mc_first(&mc)->m_pkthdr.memlen = mc.mc_mlen;
1978 }
1979 return (mc_first(&mc));
1980 } else
1981 return (NULL);
1982 }
1983 }
1984
1985 /*
1986 * Copy the contents of uio into a properly sized mbuf chain.
1987 * In case of failure state of mchain is inconsistent.
1988 * @param length Limit copyout length. If 0 entire uio_resid is copied.
1989 * @param lspace Provide leading space in the first mbuf in the chain.
1990 */
1991 int
mc_uiotomc(struct mchain * mc,struct uio * uio,u_int length,u_int lspace,int how,int flags)1992 mc_uiotomc(struct mchain *mc, struct uio *uio, u_int length, u_int lspace,
1993 int how, int flags)
1994 {
1995 struct mbuf *mb;
1996 u_int total;
1997 int error;
1998
1999 MPASS(lspace < MHLEN);
2000 MPASS(UINT_MAX - lspace >= length);
2001 MPASS(uio->uio_rw == UIO_WRITE);
2002 MPASS(uio->uio_resid >= 0);
2003
2004 if (length > 0) {
2005 if (uio->uio_resid > length) {
2006 total = length;
2007 flags &= ~M_EOR;
2008 } else
2009 total = uio->uio_resid;
2010 } else if (__predict_false(uio->uio_resid + lspace > UINT_MAX))
2011 return (EOVERFLOW);
2012 else
2013 total = uio->uio_resid;
2014
2015 if (__predict_false(total + lspace == 0)) {
2016 *mc = MCHAIN_INITIALIZER(mc);
2017 return (0);
2018 }
2019
2020 error = mc_get(mc, total + lspace, how, MT_DATA, flags);
2021 if (__predict_false(error))
2022 return (error);
2023 mc_first(mc)->m_data += lspace;
2024
2025 /* Fill all mbufs with uio data and update header information. */
2026 STAILQ_FOREACH(mb, &mc->mc_q, m_stailq) {
2027 u_int mlen;
2028
2029 mlen = min(M_TRAILINGSPACE(mb), total - mc->mc_len);
2030 error = uiomove(mtod(mb, void *), mlen, uio);
2031 if (__predict_false(error)) {
2032 mc_freem(mc);
2033 return (error);
2034 }
2035 mb->m_len = mlen;
2036 mc->mc_len += mlen;
2037 }
2038 MPASS(mc->mc_len == total);
2039
2040 return (0);
2041 }
2042
2043 /*
2044 * Copy data to/from an unmapped mbuf into a uio limited by len if set.
2045 */
2046 int
m_unmapped_uiomove(const struct mbuf * m,int m_off,struct uio * uio,int len)2047 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len)
2048 {
2049 vm_page_t pg;
2050 int error, i, off, pglen, pgoff, seglen, segoff;
2051
2052 M_ASSERTEXTPG(m);
2053 error = 0;
2054
2055 /* Skip over any data removed from the front. */
2056 off = mtod(m, vm_offset_t);
2057
2058 off += m_off;
2059 if (m->m_epg_hdrlen != 0) {
2060 if (off >= m->m_epg_hdrlen) {
2061 off -= m->m_epg_hdrlen;
2062 } else {
2063 seglen = m->m_epg_hdrlen - off;
2064 segoff = off;
2065 seglen = min(seglen, len);
2066 off = 0;
2067 len -= seglen;
2068 error = uiomove(__DECONST(void *,
2069 &m->m_epg_hdr[segoff]), seglen, uio);
2070 }
2071 }
2072 pgoff = m->m_epg_1st_off;
2073 for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
2074 pglen = m_epg_pagelen(m, i, pgoff);
2075 if (off >= pglen) {
2076 off -= pglen;
2077 pgoff = 0;
2078 continue;
2079 }
2080 seglen = pglen - off;
2081 segoff = pgoff + off;
2082 off = 0;
2083 seglen = min(seglen, len);
2084 len -= seglen;
2085 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
2086 error = uiomove_fromphys(&pg, segoff, seglen, uio);
2087 pgoff = 0;
2088 };
2089 if (len != 0 && error == 0) {
2090 KASSERT((off + len) <= m->m_epg_trllen,
2091 ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
2092 m->m_epg_trllen, m_off));
2093 error = uiomove(__DECONST(void *, &m->m_epg_trail[off]),
2094 len, uio);
2095 }
2096 return (error);
2097 }
2098
2099 /*
2100 * Copy an mbuf chain into a uio limited by len if set.
2101 */
2102 int
m_mbuftouio(struct uio * uio,const struct mbuf * m,int len)2103 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
2104 {
2105 int error, length, total;
2106 int progress = 0;
2107
2108 if (len > 0)
2109 total = min(uio->uio_resid, len);
2110 else
2111 total = uio->uio_resid;
2112
2113 /* Fill the uio with data from the mbufs. */
2114 for (; m != NULL; m = m->m_next) {
2115 length = min(m->m_len, total - progress);
2116
2117 if ((m->m_flags & M_EXTPG) != 0)
2118 error = m_unmapped_uiomove(m, 0, uio, length);
2119 else
2120 error = uiomove(mtod(m, void *), length, uio);
2121 if (error)
2122 return (error);
2123
2124 progress += length;
2125 }
2126
2127 return (0);
2128 }
2129
2130 /*
2131 * Create a writable copy of the mbuf chain. While doing this
2132 * we compact the chain with a goal of producing a chain with
2133 * at most two mbufs. The second mbuf in this chain is likely
2134 * to be a cluster. The primary purpose of this work is to create
2135 * a writable packet for encryption, compression, etc. The
2136 * secondary goal is to linearize the data so the data can be
2137 * passed to crypto hardware in the most efficient manner possible.
2138 */
2139 struct mbuf *
m_unshare(struct mbuf * m0,int how)2140 m_unshare(struct mbuf *m0, int how)
2141 {
2142 struct mbuf *m, *mprev;
2143 struct mbuf *n, *mfirst, *mlast;
2144 int len, off;
2145
2146 mprev = NULL;
2147 for (m = m0; m != NULL; m = mprev->m_next) {
2148 /*
2149 * Regular mbufs are ignored unless there's a cluster
2150 * in front of it that we can use to coalesce. We do
2151 * the latter mainly so later clusters can be coalesced
2152 * also w/o having to handle them specially (i.e. convert
2153 * mbuf+cluster -> cluster). This optimization is heavily
2154 * influenced by the assumption that we're running over
2155 * Ethernet where MCLBYTES is large enough that the max
2156 * packet size will permit lots of coalescing into a
2157 * single cluster. This in turn permits efficient
2158 * crypto operations, especially when using hardware.
2159 */
2160 if ((m->m_flags & M_EXT) == 0) {
2161 if (mprev && (mprev->m_flags & M_EXT) &&
2162 m->m_len <= M_TRAILINGSPACE(mprev)) {
2163 /* XXX: this ignores mbuf types */
2164 memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2165 mtod(m, caddr_t), m->m_len);
2166 mprev->m_len += m->m_len;
2167 mprev->m_next = m->m_next; /* unlink from chain */
2168 m_free(m); /* reclaim mbuf */
2169 } else {
2170 mprev = m;
2171 }
2172 continue;
2173 }
2174 /*
2175 * Writable mbufs are left alone (for now).
2176 */
2177 if (M_WRITABLE(m)) {
2178 mprev = m;
2179 continue;
2180 }
2181
2182 /*
2183 * Not writable, replace with a copy or coalesce with
2184 * the previous mbuf if possible (since we have to copy
2185 * it anyway, we try to reduce the number of mbufs and
2186 * clusters so that future work is easier).
2187 */
2188 KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
2189 /* NB: we only coalesce into a cluster or larger */
2190 if (mprev != NULL && (mprev->m_flags & M_EXT) &&
2191 m->m_len <= M_TRAILINGSPACE(mprev)) {
2192 /* XXX: this ignores mbuf types */
2193 memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2194 mtod(m, caddr_t), m->m_len);
2195 mprev->m_len += m->m_len;
2196 mprev->m_next = m->m_next; /* unlink from chain */
2197 m_free(m); /* reclaim mbuf */
2198 continue;
2199 }
2200
2201 /*
2202 * Allocate new space to hold the copy and copy the data.
2203 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
2204 * splitting them into clusters. We could just malloc a
2205 * buffer and make it external but too many device drivers
2206 * don't know how to break up the non-contiguous memory when
2207 * doing DMA.
2208 */
2209 n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
2210 if (n == NULL) {
2211 m_freem(m0);
2212 return (NULL);
2213 }
2214 if (m->m_flags & M_PKTHDR) {
2215 KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
2216 __func__, m0, m));
2217 m_move_pkthdr(n, m);
2218 }
2219 len = m->m_len;
2220 off = 0;
2221 mfirst = n;
2222 mlast = NULL;
2223 for (;;) {
2224 int cc = min(len, MCLBYTES);
2225 memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
2226 n->m_len = cc;
2227 if (mlast != NULL)
2228 mlast->m_next = n;
2229 mlast = n;
2230 #if 0
2231 newipsecstat.ips_clcopied++;
2232 #endif
2233
2234 len -= cc;
2235 if (len <= 0)
2236 break;
2237 off += cc;
2238
2239 n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
2240 if (n == NULL) {
2241 m_freem(mfirst);
2242 m_freem(m0);
2243 return (NULL);
2244 }
2245 }
2246 n->m_next = m->m_next;
2247 if (mprev == NULL)
2248 m0 = mfirst; /* new head of chain */
2249 else
2250 mprev->m_next = mfirst; /* replace old mbuf */
2251 m_free(m); /* release old mbuf */
2252 mprev = mfirst;
2253 }
2254 return (m0);
2255 }
2256
2257 #ifdef MBUF_PROFILING
2258
/*
 * Number of buckets in each profiling histogram.  m_profile() indexes the
 * "segments" histogram with a count clamped to MP_BUCKETS - 1 and the
 * "used"/"wasted" histograms with fls() of a clamped int, so shrinking
 * this value can push those indices out of range.
 */
#define MP_BUCKETS 32

/*
 * Global mbuf profiling counters, one histogram per metric.  Updated by
 * m_profile(), reported by mbprof_handler() and zeroed by
 * mbprof_clr_handler().
 */
struct mbufprofile {
	uintmax_t wasted[MP_BUCKETS];	/* chains per fls(wasted bytes) */
	uintmax_t used[MP_BUCKETS];	/* chains per fls(used bytes) */
	uintmax_t segments[MP_BUCKETS];	/* chains per mbuf count */
};

struct mbufprofile mbprof;
2265
2266 void
m_profile(struct mbuf * m)2267 m_profile(struct mbuf *m)
2268 {
2269 int segments = 0;
2270 int used = 0;
2271 int wasted = 0;
2272
2273 while (m) {
2274 segments++;
2275 used += m->m_len;
2276 if (m->m_flags & M_EXT) {
2277 wasted += MHLEN - sizeof(m->m_ext) +
2278 m->m_ext.ext_size - m->m_len;
2279 } else {
2280 if (m->m_flags & M_PKTHDR)
2281 wasted += MHLEN - m->m_len;
2282 else
2283 wasted += MLEN - m->m_len;
2284 }
2285 m = m->m_next;
2286 }
2287 /* be paranoid.. it helps */
2288 if (segments > MP_BUCKETS - 1)
2289 segments = MP_BUCKETS - 1;
2290 if (used > 100000)
2291 used = 100000;
2292 if (wasted > 100000)
2293 wasted = 100000;
2294 /* store in the appropriate bucket */
2295 /* don't bother locking. if it's slightly off, so what? */
2296 mbprof.segments[segments]++;
2297 mbprof.used[fls(used)]++;
2298 mbprof.wasted[fls(wasted)]++;
2299 }
2300
2301 static int
mbprof_handler(SYSCTL_HANDLER_ARGS)2302 mbprof_handler(SYSCTL_HANDLER_ARGS)
2303 {
2304 char buf[256];
2305 struct sbuf sb;
2306 int error;
2307 uint64_t *p;
2308
2309 sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req);
2310
2311 p = &mbprof.wasted[0];
2312 sbuf_printf(&sb,
2313 "wasted:\n"
2314 "%ju %ju %ju %ju %ju %ju %ju %ju "
2315 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2316 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2317 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2318 #ifdef BIG_ARRAY
2319 p = &mbprof.wasted[16];
2320 sbuf_printf(&sb,
2321 "%ju %ju %ju %ju %ju %ju %ju %ju "
2322 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2323 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2324 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2325 #endif
2326 p = &mbprof.used[0];
2327 sbuf_printf(&sb,
2328 "used:\n"
2329 "%ju %ju %ju %ju %ju %ju %ju %ju "
2330 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2331 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2332 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2333 #ifdef BIG_ARRAY
2334 p = &mbprof.used[16];
2335 sbuf_printf(&sb,
2336 "%ju %ju %ju %ju %ju %ju %ju %ju "
2337 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2338 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2339 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2340 #endif
2341 p = &mbprof.segments[0];
2342 sbuf_printf(&sb,
2343 "segments:\n"
2344 "%ju %ju %ju %ju %ju %ju %ju %ju "
2345 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2346 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2347 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2348 #ifdef BIG_ARRAY
2349 p = &mbprof.segments[16];
2350 sbuf_printf(&sb,
2351 "%ju %ju %ju %ju %ju %ju %ju %ju "
2352 "%ju %ju %ju %ju %ju %ju %ju %jju",
2353 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2354 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2355 #endif
2356
2357 error = sbuf_finish(&sb);
2358 sbuf_delete(&sb);
2359 return (error);
2360 }
2361
2362 static int
mbprof_clr_handler(SYSCTL_HANDLER_ARGS)2363 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
2364 {
2365 int clear, error;
2366
2367 clear = 0;
2368 error = sysctl_handle_int(oidp, &clear, 0, req);
2369 if (error || !req->newptr)
2370 return (error);
2371
2372 if (clear) {
2373 bzero(&mbprof, sizeof(mbprof));
2374 }
2375
2376 return (error);
2377 }
2378
/*
 * Register the "mbufprofile" OID under the _kern_ipc parent: a read-only
 * (CTLFLAG_RD) string node whose contents are produced on demand by
 * mbprof_handler().
 */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    mbprof_handler, "A",
    "mbuf profiling statistics");
2383
/*
 * Register the "mbufprofileclr" OID under the _kern_ipc parent: a
 * read-write (CTLFLAG_RW) integer node handled by mbprof_clr_handler(),
 * which clears the profiling counters when a non-zero value is written.
 */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    mbprof_clr_handler, "I",
    "clear mbuf profiling statistics");
2388 #endif
2389