xref: /freebsd/sys/netinet/ip_reass.c (revision 64981536)
1 /*-
2  * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
3  * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
4  * Copyright (c) 1982, 1986, 1988, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_rss.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/eventhandler.h>
42 #include <sys/kernel.h>
43 #include <sys/hash.h>
44 #include <sys/mbuf.h>
45 #include <sys/malloc.h>
46 #include <sys/limits.h>
47 #include <sys/lock.h>
48 #include <sys/mutex.h>
49 #include <sys/sysctl.h>
50 #include <sys/socket.h>
51 
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <net/rss_config.h>
55 #include <net/netisr.h>
56 #include <net/vnet.h>
57 
58 #include <netinet/in.h>
59 #include <netinet/ip.h>
60 #include <netinet/ip_var.h>
61 #include <netinet/in_rss.h>
62 #ifdef MAC
63 #include <security/mac/mac_framework.h>
64 #endif
65 
66 SYSCTL_DECL(_net_inet_ip);
67 
68 /*
69  * Reassembly headers are stored in hash buckets.
70  */
71 #define	IPREASS_NHASH_LOG2	10
72 #define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
73 #define	IPREASS_HMASK		(IPREASS_NHASH - 1)
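/*
 * With IPREASS_NHASH_LOG2 set to 10 this yields 1024 buckets.  Each bucket
 * carries its own mutex and entry count, so fragments of unrelated flows
 * can be processed concurrently without a single global reassembly lock.
 */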
74 
75 struct ipqbucket {
76 	TAILQ_HEAD(ipqhead, ipq) head;
77 	struct mtx		 lock;
78 	int			 count;
79 };
80 
81 VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]);
82 #define	V_ipq		VNET(ipq)
83 VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
84 #define V_ipq_hashseed   VNET(ipq_hashseed)
85 
86 #define	IPQ_LOCK(i)	mtx_lock(&V_ipq[i].lock)
87 #define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
88 #define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
89 #define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
90 
91 VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
92 #define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)
93 
94 void		ipreass_init(void);
95 void		ipreass_vnet_init(void);
96 #ifdef VIMAGE
97 void		ipreass_destroy(void);
98 #endif
99 static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
100 static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
101 static void	ipreass_zone_change(void *);
102 static void	ipreass_drain_tomax(void);
103 static void	ipq_free(struct ipqbucket *, struct ipq *);
104 static struct ipq * ipq_reuse(int);
105 
106 static inline void
107 ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
108 {
109 
110 	IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
111 	ipq_free(bucket, fp);
112 }
113 
114 static inline void
115 ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
116 {
117 
118 	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
119 	ipq_free(bucket, fp);
120 }
121 
122 /*
123  * By default, limit the number of IP fragments across all reassembly
 * queues to 1/32 of the total number of mbuf clusters.
125  *
126  * Limit the total number of reassembly queues per VNET to the
127  * IP fragment limit, but ensure the limit will not allow any bucket
128  * to grow above 100 items. (The bucket limit is
129  * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct
130  * multiplier to reach a 100-item limit.)
131  * The 100-item limit was chosen as brief testing seems to show that
132  * this produces "reasonable" performance on some subset of systems
133  * under DoS attack.
134  */
135 #define	IP_MAXFRAGS		(nmbclusters / 32)
136 #define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, IPREASS_NHASH * 50))
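/*
 * For example, on a system where nmbclusters happens to be 262144 (an
 * illustrative value only), IP_MAXFRAGS works out to 8192 fragments and
 * IP_MAXFRAGPACKETS to imin(8192, 51200) = 8192 reassembly queue entries.
 */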
137 
138 static int		maxfrags;
139 static u_int __exclusive_cache_line	nfrags;
140 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
141     &maxfrags, 0,
142     "Maximum number of IPv4 fragments allowed across all reassembly queues");
143 SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
144     &nfrags, 0,
145     "Current number of IPv4 fragments across all reassembly queues");
146 
147 VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
148 #define	V_ipq_zone	VNET(ipq_zone)
149 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
150     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
151     NULL, 0, sysctl_maxfragpackets, "I",
152     "Maximum number of IPv4 fragment reassembly queue entries");
153 SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
154     &VNET_NAME(ipq_zone),
155     "Current number of IPv4 fragment reassembly queue entries");
156 
157 VNET_DEFINE_STATIC(int, noreass);
158 #define	V_noreass	VNET(noreass)
159 
160 VNET_DEFINE_STATIC(int, maxfragsperpacket);
161 #define	V_maxfragsperpacket	VNET(maxfragsperpacket)
162 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
163     &VNET_NAME(maxfragsperpacket), 0,
164     "Maximum number of IPv4 fragments allowed per packet");
165 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
166     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
167     sysctl_maxfragbucketsize, "I",
168     "Maximum number of IPv4 fragment reassembly queue entries per bucket");
169 
170 static u_int ipfragttl = IPFRAGTTL / 2;
171 SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragttl, CTLFLAG_RD, &ipfragttl,
172     IPFRAGTTL / 2, "IP fragment life time on reassembly queue");
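/*
 * The reassembly callout below fires every 500 milliseconds and decrements
 * each queue's ipq_ttl once per run, so IPFRAGTTL / 2 expresses the
 * fragment lifetime in seconds for this read-only sysctl.
 */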
173 
174 /*
175  * Take incoming datagram fragment and try to reassemble it into
 * whole datagram.  If the argument is the first fragment or one
 * in between, the function will return NULL and store the mbuf
 * in the fragment chain.  If the argument is the last fragment,
 * the packet will be reassembled and the pointer to the new
180  * mbuf returned for further processing.  Only m_tags attached
181  * to the first packet/fragment are preserved.
182  * The IP header is *NOT* adjusted out of iplen.
183  */
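
/*
 * A minimal sketch of the expected calling pattern, roughly as in
 * ip_input() (illustrative only, not the verbatim caller):
 *
 *	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 *		m = ip_reass(m);
 *		if (m == NULL)
 *			return;
 *		ip = mtod(m, struct ip *);
 *	}
 */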
184 #define	M_IP_FRAG	M_PROTO9
185 struct mbuf *
186 ip_reass(struct mbuf *m)
187 {
188 	struct ip *ip;
189 	struct mbuf *p, *q, *nq, *t;
190 	struct ipq *fp;
191 	struct ifnet *srcifp;
192 	struct ipqhead *head;
193 	int i, hlen, next, tmpmax;
194 	u_int8_t ecn, ecn0;
195 	uint32_t hash, hashkey[3];
196 #ifdef	RSS
197 	uint32_t rss_hash, rss_type;
198 #endif
199 
200 	/*
	 * If reassembly is disabled or maxfragsperpacket is 0,
	 * never accept fragments.
	 * Also, drop the packet if it would push the total number of
	 * fragments across all reassembly queues over the limit.
205 	 */
206 	tmpmax = maxfrags;
207 	if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
208 	    (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
209 		IPSTAT_INC(ips_fragments);
210 		IPSTAT_INC(ips_fragdropped);
211 		m_freem(m);
212 		return (NULL);
213 	}
214 
215 	ip = mtod(m, struct ip *);
216 	hlen = ip->ip_hl << 2;
217 
218 	/*
	 * Adjust ip_len to not reflect the header; the fragment
	 * offset is converted to bytes below.
221 	 */
222 	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
223 	/*
224 	 * Make sure that fragments have a data length
225 	 * that's a non-zero multiple of 8 bytes, unless
226 	 * this is the last fragment.
227 	 */
228 	if (ip->ip_len == htons(0) ||
229 	    ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
230 		IPSTAT_INC(ips_toosmall); /* XXX */
231 		IPSTAT_INC(ips_fragdropped);
232 		m_freem(m);
233 		return (NULL);
234 	}
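	/*
	 * M_IP_FRAG marks fragments that carry IP_MF, i.e. more fragments
	 * follow; it is used further down to recognize when the final
	 * fragment of the datagram has been seen.
	 */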
235 	if (ip->ip_off & htons(IP_MF))
236 		m->m_flags |= M_IP_FRAG;
237 	else
238 		m->m_flags &= ~M_IP_FRAG;
239 	ip->ip_off = htons(ntohs(ip->ip_off) << 3);
240 
241 	/*
242 	 * Make sure the fragment lies within a packet of valid size.
243 	 */
244 	if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
245 		IPSTAT_INC(ips_toolong);
246 		IPSTAT_INC(ips_fragdropped);
247 		m_freem(m);
248 		return (NULL);
249 	}
250 
251 	/*
252 	 * Store receive network interface pointer for later.
253 	 */
254 	srcifp = m->m_pkthdr.rcvif;
255 
256 	/*
257 	 * Attempt reassembly; if it succeeds, proceed.
258 	 * ip_reass() will return a different mbuf.
259 	 */
260 	IPSTAT_INC(ips_fragments);
261 	m->m_pkthdr.PH_loc.ptr = ip;
262 
263 	/*
	 * Strip the IP header from this mbuf; leaving the header bytes
	 * in place would confuse the payload-length handling below.
266 	 */
267 	m->m_data += hlen;
268 	m->m_len -= hlen;
269 
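	/*
	 * Select a bucket by hashing (source, destination, protocol, id)
	 * with a per-VNET random seed, so that a remote sender cannot
	 * easily aim all of its fragments at one reassembly bucket.
	 */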
270 	hashkey[0] = ip->ip_src.s_addr;
271 	hashkey[1] = ip->ip_dst.s_addr;
272 	hashkey[2] = (uint32_t)ip->ip_p << 16;
273 	hashkey[2] += ip->ip_id;
274 	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
275 	hash &= IPREASS_HMASK;
276 	head = &V_ipq[hash].head;
277 	IPQ_LOCK(hash);
278 
279 	/*
280 	 * Look for queue of fragments
281 	 * of this datagram.
282 	 */
283 	TAILQ_FOREACH(fp, head, ipq_list)
284 		if (ip->ip_id == fp->ipq_id &&
285 		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
286 		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
287 #ifdef MAC
288 		    mac_ipq_match(m, fp) &&
289 #endif
290 		    ip->ip_p == fp->ipq_p)
291 			break;
292 	/*
293 	 * If first fragment to arrive, create a reassembly queue.
294 	 */
295 	if (fp == NULL) {
296 		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
297 			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
298 		if (fp == NULL)
299 			fp = ipq_reuse(hash);
300 		if (fp == NULL)
301 			goto dropfrag;
302 #ifdef MAC
303 		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
304 			uma_zfree(V_ipq_zone, fp);
305 			fp = NULL;
306 			goto dropfrag;
307 		}
308 		mac_ipq_create(m, fp);
309 #endif
310 		TAILQ_INSERT_HEAD(head, fp, ipq_list);
311 		V_ipq[hash].count++;
312 		fp->ipq_nfrags = 1;
313 		atomic_add_int(&nfrags, 1);
314 		fp->ipq_ttl = IPFRAGTTL;
315 		fp->ipq_p = ip->ip_p;
316 		fp->ipq_id = ip->ip_id;
317 		fp->ipq_src = ip->ip_src;
318 		fp->ipq_dst = ip->ip_dst;
319 		fp->ipq_frags = m;
320 		if (m->m_flags & M_IP_FRAG)
321 			fp->ipq_maxoff = -1;
322 		else
323 			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
324 		m->m_nextpkt = NULL;
325 		goto done;
326 	} else {
327 		/*
328 		 * If we already saw the last fragment, make sure
329 		 * this fragment's offset looks sane. Otherwise, if
330 		 * this is the last fragment, record its endpoint.
331 		 */
332 		if (fp->ipq_maxoff > 0) {
333 			i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
334 			if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
335 			    ((m->m_flags & M_IP_FRAG) == 0 &&
336 			    i != fp->ipq_maxoff)) {
337 				fp = NULL;
338 				goto dropfrag;
339 			}
340 		} else if ((m->m_flags & M_IP_FRAG) == 0)
341 			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
342 		fp->ipq_nfrags++;
343 		atomic_add_int(&nfrags, 1);
344 #ifdef MAC
345 		mac_ipq_update(m, fp);
346 #endif
347 	}
348 
349 #define GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))
350 
351 	/*
352 	 * Handle ECN by comparing this segment with the first one;
353 	 * if CE is set, do not lose CE.
	 * Drop if CE and not-ECT are mixed for the same packet.
355 	 */
356 	ecn = ip->ip_tos & IPTOS_ECN_MASK;
357 	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
358 	if (ecn == IPTOS_ECN_CE) {
359 		if (ecn0 == IPTOS_ECN_NOTECT)
360 			goto dropfrag;
361 		if (ecn0 != IPTOS_ECN_CE)
362 			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
363 	}
364 	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
365 		goto dropfrag;
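	/*
	 * In short: a not-ECT fragment arriving for an ECN-capable packet
	 * is dropped, as is a CE-marked fragment when the first fragment
	 * was not-ECT; otherwise a CE mark is propagated to the first
	 * fragment so it survives reassembly.
	 */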
366 
367 	/*
368 	 * Find a segment which begins after this one does.
369 	 */
370 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
371 		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
372 			break;
373 
374 	/*
375 	 * If there is a preceding segment, it may provide some of
376 	 * our data already.  If so, drop the data from the incoming
377 	 * segment.  If it provides all of our data, drop us, otherwise
378 	 * stick new segment in the proper place.
379 	 *
380 	 * If some of the data is dropped from the preceding
	 * segment, then its checksum is invalidated.
382 	 */
383 	if (p) {
384 		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
385 		    ntohs(ip->ip_off);
386 		if (i > 0) {
387 			if (i >= ntohs(ip->ip_len))
388 				goto dropfrag;
389 			m_adj(m, i);
390 			m->m_pkthdr.csum_flags = 0;
391 			ip->ip_off = htons(ntohs(ip->ip_off) + i);
392 			ip->ip_len = htons(ntohs(ip->ip_len) - i);
393 		}
394 		m->m_nextpkt = p->m_nextpkt;
395 		p->m_nextpkt = m;
396 	} else {
397 		m->m_nextpkt = fp->ipq_frags;
398 		fp->ipq_frags = m;
399 	}
400 
401 	/*
	 * While we overlap succeeding segments, trim them or,
403 	 * if they are completely covered, dequeue them.
404 	 */
405 	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
406 	    ntohs(GETIP(q)->ip_off); q = nq) {
407 		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
408 		    ntohs(GETIP(q)->ip_off);
409 		if (i < ntohs(GETIP(q)->ip_len)) {
410 			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
411 			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
412 			m_adj(q, i);
413 			q->m_pkthdr.csum_flags = 0;
414 			break;
415 		}
416 		nq = q->m_nextpkt;
417 		m->m_nextpkt = nq;
418 		IPSTAT_INC(ips_fragdropped);
419 		fp->ipq_nfrags--;
420 		atomic_subtract_int(&nfrags, 1);
421 		m_freem(q);
422 	}
423 
424 	/*
425 	 * Check for complete reassembly and perform frag per packet
426 	 * limiting.
427 	 *
428 	 * Frag limiting is performed here so that the nth frag has
429 	 * a chance to complete the packet before we drop the packet.
430 	 * As a result, n+1 frags are actually allowed per packet, but
431 	 * only n will ever be stored. (n = maxfragsperpacket.)
432 	 *
433 	 */
434 	next = 0;
435 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
436 		if (ntohs(GETIP(q)->ip_off) != next) {
437 			if (fp->ipq_nfrags > V_maxfragsperpacket)
438 				ipq_drop(&V_ipq[hash], fp);
439 			goto done;
440 		}
441 		next += ntohs(GETIP(q)->ip_len);
442 	}
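	/*
	 * Every byte from offset 0 up to 'next' is now present exactly
	 * once, and 'p' points at the last fragment on the chain.
	 */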
443 	/* Make sure the last packet didn't have the IP_MF flag */
444 	if (p->m_flags & M_IP_FRAG) {
445 		if (fp->ipq_nfrags > V_maxfragsperpacket)
446 			ipq_drop(&V_ipq[hash], fp);
447 		goto done;
448 	}
449 
450 	/*
451 	 * Reassembly is complete.  Make sure the packet is a sane size.
452 	 */
453 	q = fp->ipq_frags;
454 	ip = GETIP(q);
455 	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
456 		IPSTAT_INC(ips_toolong);
457 		ipq_drop(&V_ipq[hash], fp);
458 		goto done;
459 	}
460 
461 	/*
462 	 * Concatenate fragments.
463 	 */
464 	m = q;
465 	t = m->m_next;
466 	m->m_next = NULL;
467 	m_cat(m, t);
468 	nq = q->m_nextpkt;
469 	q->m_nextpkt = NULL;
470 	for (q = nq; q != NULL; q = nq) {
471 		nq = q->m_nextpkt;
472 		q->m_nextpkt = NULL;
473 		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
474 		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
475 		m_demote_pkthdr(q);
476 		m_cat(m, q);
477 	}
478 	/*
479 	 * In order to do checksumming faster we do 'end-around carry' here
480 	 * (and not in for{} loop), though it implies we are not going to
481 	 * reassemble more than 64k fragments.
482 	 */
483 	while (m->m_pkthdr.csum_data & 0xffff0000)
484 		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
485 		    (m->m_pkthdr.csum_data >> 16);
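	/*
	 * For example, a partial sum of 0x0001FFFF folds to 0xFFFF + 0x0001
	 * = 0x10000 on the first pass and to 0x0000 + 0x0001 = 0x0001 on
	 * the second, which is why this must loop rather than fold once.
	 */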
486 	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
487 #ifdef MAC
488 	mac_ipq_reassemble(fp, m);
489 	mac_ipq_destroy(fp);
490 #endif
491 
492 	/*
493 	 * Create header for new ip packet by modifying header of first
494 	 * packet;  dequeue and discard fragment reassembly header.
495 	 * Make header visible.
496 	 */
497 	ip->ip_len = htons((ip->ip_hl << 2) + next);
498 	ip->ip_src = fp->ipq_src;
499 	ip->ip_dst = fp->ipq_dst;
500 	TAILQ_REMOVE(head, fp, ipq_list);
501 	V_ipq[hash].count--;
502 	uma_zfree(V_ipq_zone, fp);
503 	m->m_len += (ip->ip_hl << 2);
504 	m->m_data -= (ip->ip_hl << 2);
505 	/* some debugging cruft by sklower, below, will go away soon */
506 	if (m->m_flags & M_PKTHDR) {	/* XXX this should be done elsewhere */
507 		m_fixhdr(m);
508 		/* set valid receive interface pointer */
509 		m->m_pkthdr.rcvif = srcifp;
510 	}
511 	IPSTAT_INC(ips_reassembled);
512 	IPQ_UNLOCK(hash);
513 
514 #ifdef	RSS
515 	/*
516 	 * Query the RSS layer for the flowid / flowtype for the
517 	 * mbuf payload.
518 	 *
519 	 * For now, just assume we have to calculate a new one.
520 	 * Later on we should check to see if the assigned flowid matches
521 	 * what RSS wants for the given IP protocol and if so, just keep it.
522 	 *
523 	 * We then queue into the relevant netisr so it can be dispatched
524 	 * to the correct CPU.
525 	 *
526 	 * Note - this may return 1, which means the flowid in the mbuf
527 	 * is correct for the configured RSS hash types and can be used.
528 	 */
529 	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
530 		m->m_pkthdr.flowid = rss_hash;
531 		M_HASHTYPE_SET(m, rss_type);
532 	}
533 
534 	/*
535 	 * Queue/dispatch for reprocessing.
536 	 *
537 	 * Note: this is much slower than just handling the frame in the
538 	 * current receive context.  It's likely worth investigating
539 	 * why this is.
540 	 */
541 	netisr_dispatch(NETISR_IP_DIRECT, m);
542 	return (NULL);
543 #endif
544 
545 	/* Handle in-line */
546 	return (m);
547 
548 dropfrag:
549 	IPSTAT_INC(ips_fragdropped);
550 	if (fp != NULL) {
551 		fp->ipq_nfrags--;
552 		atomic_subtract_int(&nfrags, 1);
553 	}
554 	m_freem(m);
555 done:
556 	IPQ_UNLOCK(hash);
557 	return (NULL);
558 
559 #undef GETIP
560 }
561 
562 /*
563  * If a timer expires on a reassembly queue, discard it.
564  */
565 static struct callout ipreass_callout;
566 static void
567 ipreass_slowtimo(void *arg __unused)
568 {
569 	VNET_ITERATOR_DECL(vnet_iter);
570 	struct ipq *fp, *tmp;
571 
	/*
	 * Nothing to do when no fragments are queued, but always fall
	 * through to reschedule the callout so that later fragments
	 * still get timed out.
	 */
	if (atomic_load_int(&nfrags) == 0)
		goto reschedule;

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		for (int i = 0; i < IPREASS_NHASH; i++) {
			if (TAILQ_EMPTY(&V_ipq[i].head))
				continue;
			IPQ_LOCK(i);
			TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
				if (--fp->ipq_ttl == 0)
					ipq_timeout(&V_ipq[i], fp);
			IPQ_UNLOCK(i);
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();

reschedule:
	callout_reset_sbt(&ipreass_callout, SBT_1MS * 500, SBT_1MS * 10,
	    ipreass_slowtimo, NULL, 0);
}
593 
594 static void
595 ipreass_timer_init(void *arg __unused)
596 {
597 
598 	callout_init(&ipreass_callout, 1);
599 	callout_reset_sbt(&ipreass_callout, SBT_1MS * 500, SBT_1MS * 10,
600 	    ipreass_slowtimo, NULL, 0);
601 }
602 
603 static void
604 ipreass_drain_vnet(void)
605 {
606 
607 	for (int i = 0; i < IPREASS_NHASH; i++) {
608 		IPQ_LOCK(i);
		while (!TAILQ_EMPTY(&V_ipq[i].head))
610 			ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
611 		KASSERT(V_ipq[i].count == 0,
612 		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
613 		    V_ipq[i].count, V_ipq));
614 		IPQ_UNLOCK(i);
615 	}
616 }
617 SYSINIT(ipreass, SI_SUB_VNET_DONE, SI_ORDER_ANY, ipreass_timer_init, NULL);
618 
619 /*
620  * Drain off all datagram fragments.
621  */
622 static void
623 ipreass_drain(void)
624 {
625 	VNET_ITERATOR_DECL(vnet_iter);
626 
627 	VNET_FOREACH(vnet_iter) {
628 		CURVNET_SET(vnet_iter);
629 		ipreass_drain_vnet();
630 		CURVNET_RESTORE();
631 	}
632 }
633 
635 /*
636  * Initialize IP reassembly structures.
637  */
638 void
639 ipreass_vnet_init(void)
640 {
641 	int max;
642 
643 	for (int i = 0; i < IPREASS_NHASH; i++) {
644 		TAILQ_INIT(&V_ipq[i].head);
645 		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
646 		    MTX_DEF | MTX_DUPOK);
647 		V_ipq[i].count = 0;
648 	}
649 	V_ipq_hashseed = arc4random();
650 	V_maxfragsperpacket = 16;
651 	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
652 	    NULL, UMA_ALIGN_PTR, 0);
653 	max = IP_MAXFRAGPACKETS;
654 	max = uma_zone_set_max(V_ipq_zone, max);
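	/*
	 * Dividing by half the bucket count allows any one bucket to hold
	 * up to twice its fair share; with the default limits above this
	 * works out to roughly 100 entries per bucket (see the comment
	 * near IP_MAXFRAGPACKETS).
	 */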
655 	V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
656 }
657 
658 void
659 ipreass_init(void)
660 {
661 
662 	maxfrags = IP_MAXFRAGS;
663 	EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
664 	    NULL, EVENTHANDLER_PRI_ANY);
665 	EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL,
666 	    LOWMEM_PRI_DEFAULT);
667 	EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
669 }
670 
671 /*
 * Clear the receive interface pointer from all datagram fragments that
 * were received on the departing network interface.
674  */
675 static void
676 ipreass_cleanup(void *arg __unused, struct ifnet *ifp)
677 {
678 	struct ipq *fp, *temp;
679 	struct mbuf *m;
680 	int i;
681 
682 	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
683 
684 	CURVNET_SET_QUIET(ifp->if_vnet);
685 
686 	/*
	 * Skip processing if IPv4 reassembly has not been initialised or
	 * has already been torn down by ipreass_destroy().
689 	 */
690 	if (V_ipq_zone == NULL) {
691 		CURVNET_RESTORE();
692 		return;
693 	}
694 
695 	for (i = 0; i < IPREASS_NHASH; i++) {
696 		IPQ_LOCK(i);
697 		/* Scan fragment list. */
698 		TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) {
699 			for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) {
700 				/* clear no longer valid rcvif pointer */
701 				if (m->m_pkthdr.rcvif == ifp)
702 					m->m_pkthdr.rcvif = NULL;
703 			}
704 		}
705 		IPQ_UNLOCK(i);
706 	}
707 	CURVNET_RESTORE();
708 }
709 EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0);
710 
711 #ifdef VIMAGE
712 /*
713  * Destroy IP reassembly structures.
714  */
715 void
716 ipreass_destroy(void)
717 {
718 
719 	ipreass_drain_vnet();
720 	uma_zdestroy(V_ipq_zone);
721 	V_ipq_zone = NULL;
722 	for (int i = 0; i < IPREASS_NHASH; i++)
723 		mtx_destroy(&V_ipq[i].lock);
724 }
725 #endif
726 
727 /*
728  * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
729  * max has slightly different semantics than the sysctl, for historical
730  * reasons.
731  */
732 static void
733 ipreass_drain_tomax(void)
734 {
735 	struct ipq *fp;
736 	int target;
737 
738 	/*
739 	 * Make sure each bucket is under the new limit. If
740 	 * necessary, drop enough of the oldest elements from
741 	 * each bucket to get under the new limit.
742 	 */
743 	for (int i = 0; i < IPREASS_NHASH; i++) {
744 		IPQ_LOCK(i);
745 		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
746 		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
747 			ipq_timeout(&V_ipq[i], fp);
748 		IPQ_UNLOCK(i);
749 	}
750 
751 	/*
	 * If we are still over the maximum number of reassembly queue
	 * entries, drain off enough to get down to the new limit,
	 * stripping the oldest element from each bucket on every pass.
756 	 */
757 	target = uma_zone_get_max(V_ipq_zone);
758 	while (uma_zone_get_cur(V_ipq_zone) > target) {
759 		for (int i = 0; i < IPREASS_NHASH; i++) {
760 			IPQ_LOCK(i);
761 			fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
762 			if (fp != NULL)
763 				ipq_timeout(&V_ipq[i], fp);
764 			IPQ_UNLOCK(i);
765 		}
766 	}
767 }
768 
769 static void
770 ipreass_zone_change(void *tag)
771 {
772 	VNET_ITERATOR_DECL(vnet_iter);
773 	int max;
774 
775 	maxfrags = IP_MAXFRAGS;
776 	max = IP_MAXFRAGPACKETS;
777 	VNET_LIST_RLOCK_NOSLEEP();
778 	VNET_FOREACH(vnet_iter) {
779 		CURVNET_SET(vnet_iter);
780 		max = uma_zone_set_max(V_ipq_zone, max);
781 		V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
782 		ipreass_drain_tomax();
783 		CURVNET_RESTORE();
784 	}
785 	VNET_LIST_RUNLOCK_NOSLEEP();
786 }
787 
788 /*
 * Change the limit on the UMA zone, or disable fragment allocation
 * altogether.  Since 0 and -1 are special values here, we need our own
 * handler instead of sysctl_handle_uma_zone_max().
792  */
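
/*
 * Example invocations from userland (values here are only illustrative):
 *
 *	sysctl net.inet.ip.maxfragpackets=800	# limit to 800 queue entries
 *	sysctl net.inet.ip.maxfragpackets=0	# disable reassembly
 *	sysctl net.inet.ip.maxfragpackets=-1	# remove the limit entirely
 */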
793 static int
794 sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
795 {
796 	int error, max;
797 
798 	if (V_noreass == 0) {
799 		max = uma_zone_get_max(V_ipq_zone);
800 		if (max == 0)
801 			max = -1;
802 	} else
803 		max = 0;
804 	error = sysctl_handle_int(oidp, &max, 0, req);
805 	if (error || !req->newptr)
806 		return (error);
807 	if (max > 0) {
808 		/*
809 		 * XXXRW: Might be a good idea to sanity check the argument
810 		 * and place an extreme upper bound.
811 		 */
812 		max = uma_zone_set_max(V_ipq_zone, max);
813 		V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
814 		ipreass_drain_tomax();
815 		V_noreass = 0;
816 	} else if (max == 0) {
817 		V_noreass = 1;
818 		ipreass_drain();
819 	} else if (max == -1) {
820 		V_noreass = 0;
821 		uma_zone_set_max(V_ipq_zone, 0);
822 		V_ipreass_maxbucketsize = INT_MAX;
823 	} else
824 		return (EINVAL);
825 	return (0);
826 }
827 
828 /*
 * Look for an old fragment queue header that can be reused, starting with
 * the currently locked hash bucket.  Other buckets are only trylocked, so
 * that two threads scanning towards each other cannot deadlock.
831  */
832 static struct ipq *
833 ipq_reuse(int start)
834 {
835 	struct ipq *fp;
836 	int bucket, i;
837 
838 	IPQ_LOCK_ASSERT(start);
839 
840 	for (i = 0; i < IPREASS_NHASH; i++) {
841 		bucket = (start + i) % IPREASS_NHASH;
842 		if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
843 			continue;
844 		fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
845 		if (fp) {
846 			struct mbuf *m;
847 
848 			IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
849 			atomic_subtract_int(&nfrags, fp->ipq_nfrags);
850 			while (fp->ipq_frags) {
851 				m = fp->ipq_frags;
852 				fp->ipq_frags = m->m_nextpkt;
853 				m_freem(m);
854 			}
855 			TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
856 			V_ipq[bucket].count--;
857 			if (bucket != start)
858 				IPQ_UNLOCK(bucket);
859 			break;
860 		}
861 		if (bucket != start)
862 			IPQ_UNLOCK(bucket);
863 	}
864 	IPQ_LOCK_ASSERT(start);
865 	return (fp);
866 }
867 
868 /*
869  * Free a fragment reassembly header and all associated datagrams.
870  */
871 static void
872 ipq_free(struct ipqbucket *bucket, struct ipq *fp)
873 {
874 	struct mbuf *q;
875 
876 	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
877 	while (fp->ipq_frags) {
878 		q = fp->ipq_frags;
879 		fp->ipq_frags = q->m_nextpkt;
880 		m_freem(q);
881 	}
882 	TAILQ_REMOVE(&bucket->head, fp, ipq_list);
883 	bucket->count--;
884 	uma_zfree(V_ipq_zone, fp);
885 }
886 
887 /*
888  * Get or set the maximum number of reassembly queues per bucket.
889  */
890 static int
891 sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
892 {
893 	int error, max;
894 
895 	max = V_ipreass_maxbucketsize;
896 	error = sysctl_handle_int(oidp, &max, 0, req);
897 	if (error || !req->newptr)
898 		return (error);
899 	if (max <= 0)
900 		return (EINVAL);
901 	V_ipreass_maxbucketsize = max;
902 	ipreass_drain_tomax();
903 	return (0);
904 }
905