xref: /openbsd/sys/netinet/ip_icmp.c (revision 4cfece93)
1 /*	$OpenBSD: ip_icmp.c,v 1.181 2018/11/28 08:15:29 claudio Exp $	*/
2 /*	$NetBSD: ip_icmp.c,v 1.19 1996/02/13 23:42:22 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  *	This product includes software developed by the University of
46  *	California, Berkeley and its contributors.
47  *	This product includes software developed at the Information
48  *	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include "carp.h"
72 #include "pf.h"
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/mbuf.h>
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/sysctl.h>
80 
81 #include <net/if.h>
82 #include <net/if_var.h>
83 #include <net/route.h>
84 
85 #include <netinet/in.h>
86 #include <netinet/in_systm.h>
87 #include <netinet/in_var.h>
88 #include <netinet/ip.h>
89 #include <netinet/ip_icmp.h>
90 #include <netinet/ip_var.h>
91 #include <netinet/icmp_var.h>
92 
93 #if NCARP > 0
94 #include <net/if_types.h>
95 #include <netinet/ip_carp.h>
96 #endif
97 
98 #if NPF > 0
99 #include <net/pfvar.h>
100 #endif
101 
102 /*
103  * ICMP routines: error generation, receive packet processing, and
104  * routines to turnaround packets back to the originator, and
105  * host table maintenance routines.
106  */
107 
#ifdef ICMPPRINTFS
int	icmpprintfs = 0;	/* Settable from ddb */
#endif

/* values controllable via sysctl */
int	icmpmaskrepl = 0;	/* non-zero: answer ICMP_MASKREQ */
int	icmpbmcastecho = 0;	/* non-zero: answer bcast/mcast echo+tstamp */
int	icmptstamprepl = 1;	/* non-zero: answer ICMP_TSTAMP */
int	icmperrppslim = 100;	/* limit handed to ppsratecheck() */
int	icmp_rediraccept = 0;	/* non-zero: act on ICMP_REDIRECT */
int	icmp_redirtimeout = 10 * 60;	/* redirect timer queue timeout;
					 * 0 means no queue (presumably
					 * seconds -- confirm) */

/* State for ppsratecheck() as used by icmp_ratelimit(). */
static int icmperrpps_count = 0;
static struct timeval icmperrppslim_last;

/* Timer queue for routes installed by redirects; NULL when disabled. */
static struct rttimer_queue *icmp_redirect_timeout_q = NULL;
/* Counters backing struct icmpstat, allocated in icmp_init(). */
struct cpumem *icmpcounters;

/* Lookup table used by icmp_sysctl() for the plain integer knobs. */
int *icmpctl_vars[ICMPCTL_MAXID] = ICMPCTL_VARS;

/* Prototypes for functions defined later in this file. */
void icmp_mtudisc_timeout(struct rtentry *, struct rttimer *);
int icmp_ratelimit(const struct in_addr *, const int, const int);
void icmp_redirect_timeout(struct rtentry *, struct rttimer *);
int icmp_input_if(struct ifnet *, struct mbuf **, int *, int, int);
int icmp_sysctl_icmpstat(void *, size_t *, void *);
133 
134 void
135 icmp_init(void)
136 {
137 	icmpcounters = counters_alloc(icps_ncounters);
138 	/*
139 	 * This is only useful if the user initializes redirtimeout to
140 	 * something other than zero.
141 	 */
142 	if (icmp_redirtimeout != 0) {
143 		icmp_redirect_timeout_q =
144 		    rt_timer_queue_create(icmp_redirtimeout);
145 	}
146 }
147 
148 struct mbuf *
149 icmp_do_error(struct mbuf *n, int type, int code, u_int32_t dest, int destmtu)
150 {
151 	struct ip *oip = mtod(n, struct ip *), *nip;
152 	unsigned oiplen = oip->ip_hl << 2;
153 	struct icmp *icp;
154 	struct mbuf *m;
155 	unsigned icmplen, mblen;
156 
157 #ifdef ICMPPRINTFS
158 	if (icmpprintfs)
159 		printf("icmp_error(%x, %d, %d)\n", oip, type, code);
160 #endif
161 	if (type != ICMP_REDIRECT)
162 		icmpstat_inc(icps_error);
163 	/*
164 	 * Don't send error if not the first fragment of message.
165 	 * Don't error if the old packet protocol was ICMP
166 	 * error message, only known informational types.
167 	 */
168 	if (oip->ip_off & htons(IP_OFFMASK))
169 		goto freeit;
170 	if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
171 	    n->m_len >= oiplen + ICMP_MINLEN &&
172 	    !ICMP_INFOTYPE(((struct icmp *)
173 	    ((caddr_t)oip + oiplen))->icmp_type)) {
174 		icmpstat_inc(icps_oldicmp);
175 		goto freeit;
176 	}
177 	/* Don't send error in response to a multicast or broadcast packet */
178 	if (n->m_flags & (M_BCAST|M_MCAST))
179 		goto freeit;
180 
181 	/*
182 	 * First, do a rate limitation check.
183 	 */
184 	if (icmp_ratelimit(&oip->ip_src, type, code)) {
185 		icmpstat_inc(icps_toofreq);
186 		goto freeit;
187 	}
188 
189 	/*
190 	 * Now, formulate icmp message
191 	 */
192 	icmplen = oiplen + min(8, ntohs(oip->ip_len));
193 	/*
194 	 * Defend against mbuf chains shorter than oip->ip_len:
195 	 */
196 	mblen = 0;
197 	for (m = n; m && (mblen < icmplen); m = m->m_next)
198 		mblen += m->m_len;
199 	icmplen = min(mblen, icmplen);
200 
201 	/*
202 	 * As we are not required to return everything we have,
203 	 * we return whatever we can return at ease.
204 	 *
205 	 * Note that ICMP datagrams longer than 576 octets are out of spec
206 	 * according to RFC1812;
207 	 */
208 
209 	KASSERT(ICMP_MINLEN + sizeof (struct ip) <= MCLBYTES);
210 
211 	if (sizeof (struct ip) + icmplen + ICMP_MINLEN > MCLBYTES)
212 		icmplen = MCLBYTES - ICMP_MINLEN - sizeof (struct ip);
213 
214 	m = m_gethdr(M_DONTWAIT, MT_HEADER);
215 	if (m && ((sizeof (struct ip) + icmplen + ICMP_MINLEN +
216 	    sizeof(long) - 1) &~ (sizeof(long) - 1)) > MHLEN) {
217 		MCLGET(m, M_DONTWAIT);
218 		if ((m->m_flags & M_EXT) == 0) {
219 			m_freem(m);
220 			m = NULL;
221 		}
222 	}
223 	if (m == NULL)
224 		goto freeit;
225 	/* keep in same rtable and preserve other pkthdr bits */
226 	m->m_pkthdr.ph_rtableid = n->m_pkthdr.ph_rtableid;
227 	m->m_pkthdr.ph_ifidx = n->m_pkthdr.ph_ifidx;
228 	/* move PF_GENERATED to new packet, if existent XXX preserve more? */
229 	if (n->m_pkthdr.pf.flags & PF_TAG_GENERATED)
230 		m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
231 	m->m_pkthdr.len = m->m_len = icmplen + ICMP_MINLEN;
232 	m_align(m, m->m_len);
233 	icp = mtod(m, struct icmp *);
234 	if ((u_int)type > ICMP_MAXTYPE)
235 		panic("icmp_error");
236 	icmpstat_inc(icps_outhist + type);
237 	icp->icmp_type = type;
238 	if (type == ICMP_REDIRECT)
239 		icp->icmp_gwaddr.s_addr = dest;
240 	else {
241 		icp->icmp_void = 0;
242 		/*
243 		 * The following assignments assume an overlay with the
244 		 * zeroed icmp_void field.
245 		 */
246 		if (type == ICMP_PARAMPROB) {
247 			icp->icmp_pptr = code;
248 			code = 0;
249 		} else if (type == ICMP_UNREACH &&
250 		    code == ICMP_UNREACH_NEEDFRAG && destmtu)
251 			icp->icmp_nextmtu = htons(destmtu);
252 	}
253 
254 	icp->icmp_code = code;
255 	m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
256 
257 	/*
258 	 * Now, copy old ip header (without options)
259 	 * in front of icmp message.
260 	 */
261 	m = m_prepend(m, sizeof(struct ip), M_DONTWAIT);
262 	if (m == NULL)
263 		goto freeit;
264 	nip = mtod(m, struct ip *);
265 	/* ip_v set in ip_output */
266 	nip->ip_hl = sizeof(struct ip) >> 2;
267 	nip->ip_tos = 0;
268 	nip->ip_len = htons(m->m_len);
269 	/* ip_id set in ip_output */
270 	nip->ip_off = 0;
271 	/* ip_ttl set in icmp_reflect */
272 	nip->ip_p = IPPROTO_ICMP;
273 	nip->ip_src = oip->ip_src;
274 	nip->ip_dst = oip->ip_dst;
275 
276 	m_freem(n);
277 	return (m);
278 
279 freeit:
280 	m_freem(n);
281 	return (NULL);
282 }
283 
284 /*
285  * Generate an error packet of type error
286  * in response to bad packet ip.
287  *
288  * The ip packet inside has ip_off and ip_len in host byte order.
289  */
290 void
291 icmp_error(struct mbuf *n, int type, int code, u_int32_t dest, int destmtu)
292 {
293 	struct mbuf *m;
294 
295 	m = icmp_do_error(n, type, code, dest, destmtu);
296 	if (m != NULL)
297 		if (!icmp_reflect(m, NULL, NULL))
298 			icmp_send(m, NULL);
299 }
300 
301 /*
302  * Process a received ICMP message.
303  */
304 int
305 icmp_input(struct mbuf **mp, int *offp, int proto, int af)
306 {
307 	struct ifnet *ifp;
308 
309 	ifp = if_get((*mp)->m_pkthdr.ph_ifidx);
310 	if (ifp == NULL) {
311 		m_freemp(mp);
312 		return IPPROTO_DONE;
313 	}
314 
315 	proto = icmp_input_if(ifp, mp, offp, proto, af);
316 	if_put(ifp);
317 	return proto;
318 }
319 
/*
 * Process a received ICMP message for interface ``ifp''.
 *
 * ``*mp'' is the packet and ``*offp'' the length of its IP header.
 * After length and checksum validation the message is dispatched on
 * its type:
 *  - UNREACH, TIMXCEED, PARAMPROB and SOURCEQUENCH are mapped to a
 *    PRC_* code and passed to the pr_ctlinput handler of the protocol
 *    named in the quoted IP header;
 *  - ECHO, TSTAMP and MASKREQ are rewritten into replies in place and
 *    reflected to the sender, subject to icmpbmcastecho,
 *    icmptstamprepl and icmpmaskrepl;
 *  - REDIRECT is applied with rtredirect() when icmp_rediraccept is
 *    set and ipforwarding is not 1.
 * Unless the packet was freed or reflected, it is finally handed to
 * rip_input() for raw listeners.
 *
 * Returns IPPROTO_DONE when the packet was consumed here, otherwise
 * the return value of rip_input().
 */
int
icmp_input_if(struct ifnet *ifp, struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	int hlen = *offp;
	struct icmp *icp;
	struct ip *ip = mtod(m, struct ip *);
	struct sockaddr_in sin;
	int icmplen, i, code;
	struct in_ifaddr *ia;
	void (*ctlfunc)(int, struct sockaddr *, u_int, void *);
	struct mbuf *opts;

	/*
	 * Locate icmp structure in mbuf, and check
	 * that not corrupted and of at least minimum length.
	 */
	icmplen = ntohs(ip->ip_len) - hlen;
#ifdef ICMPPRINTFS
	if (icmpprintfs) {
		char dst[INET_ADDRSTRLEN], src[INET_ADDRSTRLEN];

		inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst));
		inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src));

		printf("icmp_input from %s to %s, len %d\n", src, dst, icmplen);
	}
#endif
	if (icmplen < ICMP_MINLEN) {
		icmpstat_inc(icps_tooshort);
		goto freeit;
	}
	/* Make the IP header plus up to ICMP_ADVLENMAX bytes contiguous. */
	i = hlen + min(icmplen, ICMP_ADVLENMAX);
	if ((m = *mp = m_pullup(m, i)) == NULL) {
		icmpstat_inc(icps_tooshort);
		return IPPROTO_DONE;
	}
	/* m_pullup() may have replaced the first mbuf; reload ip. */
	ip = mtod(m, struct ip *);
	if (in4_cksum(m, 0, hlen, icmplen)) {
		icmpstat_inc(icps_checksum);
		goto freeit;
	}

	icp = (struct icmp *)(mtod(m, caddr_t) + hlen);
#ifdef ICMPPRINTFS
	/*
	 * Message type specific processing.
	 */
	if (icmpprintfs)
		printf("icmp_input, type %d code %d\n", icp->icmp_type,
		    icp->icmp_code);
#endif
	/* Unknown types are not interpreted, only passed to raw sockets. */
	if (icp->icmp_type > ICMP_MAXTYPE)
		goto raw;
#if NPF > 0
	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
		switch (icp->icmp_type) {
		 /*
		  * As pf_icmp_mapping() considers redirects belonging to a
		  * diverted connection, we must include it here.
		  */
		case ICMP_REDIRECT:
			/* FALLTHROUGH */
		/*
		 * These ICMP types map to other connections.  They must be
		 * delivered to pr_ctlinput() also for diverted connections.
		 */
		case ICMP_UNREACH:
		case ICMP_TIMXCEED:
		case ICMP_PARAMPROB:
		case ICMP_SOURCEQUENCH:
			/*
			 * Do not use the divert-to property of the TCP or UDP
			 * rule when doing the PCB lookup for the raw socket.
			 */
			m->m_pkthdr.pf.flags &=~ PF_TAG_DIVERTED;
			break;
		default:
			goto raw;
		}
	}
#endif /* NPF */
	icmpstat_inc(icps_inhist + icp->icmp_type);
	code = icp->icmp_code;
	switch (icp->icmp_type) {

	case ICMP_UNREACH:
		/* Translate the ICMP code into a PRC_* control code. */
		switch (code) {
		case ICMP_UNREACH_NET:
		case ICMP_UNREACH_HOST:
		case ICMP_UNREACH_PROTOCOL:
		case ICMP_UNREACH_PORT:
		case ICMP_UNREACH_SRCFAIL:
			code += PRC_UNREACH_NET;
			break;

		case ICMP_UNREACH_NEEDFRAG:
			code = PRC_MSGSIZE;
			break;

		case ICMP_UNREACH_NET_UNKNOWN:
		case ICMP_UNREACH_NET_PROHIB:
		case ICMP_UNREACH_TOSNET:
			code = PRC_UNREACH_NET;
			break;

		case ICMP_UNREACH_HOST_UNKNOWN:
		case ICMP_UNREACH_ISOLATED:
		case ICMP_UNREACH_HOST_PROHIB:
		case ICMP_UNREACH_TOSHOST:
		case ICMP_UNREACH_FILTER_PROHIB:
		case ICMP_UNREACH_HOST_PRECEDENCE:
		case ICMP_UNREACH_PRECEDENCE_CUTOFF:
			code = PRC_UNREACH_HOST;
			break;

		default:
			goto badcode;
		}
		goto deliver;

	case ICMP_TIMXCEED:
		if (code > 1)
			goto badcode;
		code += PRC_TIMXCEED_INTRANS;
		goto deliver;

	case ICMP_PARAMPROB:
		if (code > 1)
			goto badcode;
		code = PRC_PARAMPROB;
		goto deliver;

	case ICMP_SOURCEQUENCH:
		if (code)
			goto badcode;
		code = PRC_QUENCH;
	deliver:
		/*
		 * Problem with datagram; advise higher level routines.
		 */
		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
			icmpstat_inc(icps_badlen);
			goto freeit;
		}
		if (IN_MULTICAST(icp->icmp_ip.ip_dst.s_addr))
			goto badcode;
#ifdef INET6
		/*
		 * A v6 in v4 ICMP message must be long enough to hold
		 * the quoted IPv6 header as well.
		 */
		if (icp->icmp_ip.ip_p == IPPROTO_IPV6) {
			if (icmplen < ICMP_V6ADVLENMIN ||
			    icmplen < ICMP_V6ADVLEN(icp)) {
				icmpstat_inc(icps_badlen);
				goto freeit;
			}
		}
#endif /* INET6 */
#ifdef ICMPPRINTFS
		if (icmpprintfs)
			printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
#endif
		/* Destination of the quoted packet, for the ctlinput call. */
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(struct sockaddr_in);
		sin.sin_addr = icp->icmp_ip.ip_dst;
#if NCARP > 0
		if (carp_lsdrop(ifp, m, AF_INET, &sin.sin_addr.s_addr,
		    &ip->ip_dst.s_addr, 1))
			goto freeit;
#endif
		/*
		 * XXX if the packet contains [IPv4 AH TCP], we can't make a
		 * notification to TCP layer.
		 */
		ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
		if (ctlfunc)
			(*ctlfunc)(code, sintosa(&sin), m->m_pkthdr.ph_rtableid,
			    &icp->icmp_ip);
		break;

	badcode:
		icmpstat_inc(icps_badcode);
		break;

	case ICMP_ECHO:
		if (!icmpbmcastecho &&
		    (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
			icmpstat_inc(icps_bmcastecho);
			break;
		}
		icp->icmp_type = ICMP_ECHOREPLY;
		goto reflect;

	case ICMP_TSTAMP:
		if (icmptstamprepl == 0)
			break;

		if (!icmpbmcastecho &&
		    (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
			icmpstat_inc(icps_bmcastecho);
			break;
		}
		if (icmplen < ICMP_TSLEN) {
			icmpstat_inc(icps_badlen);
			break;
		}
		icp->icmp_type = ICMP_TSTAMPREPLY;
		icp->icmp_rtime = iptime();
		icp->icmp_ttime = icp->icmp_rtime;	/* bogus, do later! */
		goto reflect;

	case ICMP_MASKREQ:
		if (icmpmaskrepl == 0)
			break;
		if (icmplen < ICMP_MASKLEN) {
			icmpstat_inc(icps_badlen);
			break;
		}
		/*
		 * We are not able to respond with all ones broadcast
		 * unless we receive it over a point-to-point interface.
		 */
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(struct sockaddr_in);
		if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
		    ip->ip_dst.s_addr == INADDR_ANY)
			sin.sin_addr = ip->ip_src;
		else
			sin.sin_addr = ip->ip_dst;
		if (ifp == NULL)
			break;
		/* Pick the address on ifp that matches the chosen address. */
		ia = ifatoia(ifaof_ifpforaddr(sintosa(&sin), ifp));
		if (ia == NULL)
			break;
		icp->icmp_type = ICMP_MASKREPLY;
		icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
		/*
		 * A request from source 0 gets its source rewritten so the
		 * reflected reply goes to the broadcast or peer address.
		 */
		if (ip->ip_src.s_addr == 0) {
			if (ifp->if_flags & IFF_BROADCAST) {
				if (ia->ia_broadaddr.sin_addr.s_addr)
					ip->ip_src = ia->ia_broadaddr.sin_addr;
				else
					ip->ip_src.s_addr = INADDR_BROADCAST;
			}
			else if (ifp->if_flags & IFF_POINTOPOINT)
				ip->ip_src = ia->ia_dstaddr.sin_addr;
		}
reflect:
#if NCARP > 0
		if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr,
		    &ip->ip_dst.s_addr, 1))
			goto freeit;
#endif
		icmpstat_inc(icps_reflect);
		icmpstat_inc(icps_outhist + icp->icmp_type);
		/* icmp_reflect() frees m itself when it fails. */
		if (!icmp_reflect(m, &opts, NULL)) {
			icmp_send(m, opts);
			m_free(opts);
		}
		return IPPROTO_DONE;

	case ICMP_REDIRECT:
	{
		struct sockaddr_in sdst;
		struct sockaddr_in sgw;
		struct sockaddr_in ssrc;
		struct rtentry *newrt = NULL;

		if (icmp_rediraccept == 0 || ipforwarding == 1)
			goto freeit;
		if (code > 3)
			goto badcode;
		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
			icmpstat_inc(icps_badlen);
			break;
		}
		/*
		 * Short circuit routing redirects to force
		 * immediate change in the kernel's routing
		 * tables.  The message is also handed to anyone
		 * listening on a raw socket (e.g. the routing
		 * daemon for use in updating its tables).
		 */
		memset(&sdst, 0, sizeof(sdst));
		memset(&sgw, 0, sizeof(sgw));
		memset(&ssrc, 0, sizeof(ssrc));
		sdst.sin_family = sgw.sin_family = ssrc.sin_family = AF_INET;
		sdst.sin_len = sgw.sin_len = ssrc.sin_len = sizeof(sdst);
		memcpy(&sdst.sin_addr, &icp->icmp_ip.ip_dst,
		    sizeof(sdst.sin_addr));
		memcpy(&sgw.sin_addr, &icp->icmp_gwaddr,
		    sizeof(sgw.sin_addr));
		memcpy(&ssrc.sin_addr, &ip->ip_src,
		    sizeof(ssrc.sin_addr));

#ifdef	ICMPPRINTFS
		if (icmpprintfs) {
			char gw[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];

			inet_ntop(AF_INET, &icp->icmp_gwaddr, gw, sizeof(gw));
			inet_ntop(AF_INET, &icp->icmp_ip.ip_dst,
			    dst, sizeof(dst));

			printf("redirect dst %s to %s\n", dst, gw);
		}
#endif

#if NCARP > 0
		if (carp_lsdrop(ifp, m, AF_INET, &sdst.sin_addr.s_addr,
		    &ip->ip_dst.s_addr, 1))
			goto freeit;
#endif
		rtredirect(sintosa(&sdst), sintosa(&sgw),
		    sintosa(&ssrc), &newrt, m->m_pkthdr.ph_rtableid);
		/*
		 * Arm the expiry timer on the route from the redirect.
		 * NOTE(review): the guard tests icmp_redirtimeout, not
		 * icmp_redirect_timeout_q, so this relies on the queue
		 * existing whenever the timeout is non-zero.
		 */
		if (newrt != NULL && icmp_redirtimeout != 0) {
			(void)rt_timer_add(newrt, icmp_redirect_timeout,
			    icmp_redirect_timeout_q, m->m_pkthdr.ph_rtableid);
		}
		if (newrt != NULL)
			rtfree(newrt);
		pfctlinput(PRC_REDIRECT_HOST, sintosa(&sdst));
		break;
	}
	/*
	 * No kernel processing for the following;
	 * just fall through to send to raw listener.
	 */
	case ICMP_ECHOREPLY:
	case ICMP_ROUTERADVERT:
	case ICMP_ROUTERSOLICIT:
	case ICMP_TSTAMPREPLY:
	case ICMP_IREQREPLY:
	case ICMP_MASKREPLY:
	case ICMP_TRACEROUTE:
	case ICMP_DATACONVERR:
	case ICMP_MOBILE_REDIRECT:
	case ICMP_IPV6_WHEREAREYOU:
	case ICMP_IPV6_IAMHERE:
	case ICMP_MOBILE_REGREQUEST:
	case ICMP_MOBILE_REGREPLY:
	case ICMP_PHOTURIS:
	default:
		break;
	}

raw:
	/* Give raw sockets a look at whatever was not consumed above. */
	return rip_input(mp, offp, proto, af);

freeit:
	m_freem(m);
	return IPPROTO_DONE;
}
674 
/*
 * Reflect the ip packet back to the source
 *
 * Rewrites the IP header of ``m'' in place: the old source becomes the
 * destination, the ttl is set to MAXTTL and the new source is taken
 * from ``ia''.  When ``ia'' is NULL it is derived from a route lookup,
 * first on the old destination (used if that route is local or
 * broadcast), then on the old source.
 *
 * If the packet carries IP options they are stripped from ``m''.  When
 * ``op'' is non-NULL, any source route returned by ip_srcroute() plus
 * copies of the record-route, timestamp and security options are
 * collected in an mbuf stored in ``*op'' (NULL if there is none).
 *
 * Returns 0 on success.  On failure ``m'' is freed and EHOSTUNREACH
 * (unusable return address or no route) or ELOOP (loop counter
 * exceeded) is returned; ``*op'' is not written in that case.
 */
int
icmp_reflect(struct mbuf *m, struct mbuf **op, struct in_ifaddr *ia)
{
	struct ip *ip = mtod(m, struct ip *);
	struct mbuf *opts = NULL;
	struct sockaddr_in sin;
	struct rtentry *rt = NULL;
	int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
	u_int rtableid;

	/* Refuse sources we could not forward to, except loopback net. */
	if (!in_canforward(ip->ip_src) &&
	    ((ip->ip_src.s_addr & IN_CLASSA_NET) !=
	    htonl(IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) {
		m_freem(m);		/* Bad return address */
		return (EHOSTUNREACH);
	}

	if (m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) {
		m_freem(m);
		return (ELOOP);
	}
	/* Reset the packet header but stay in the same routing table. */
	rtableid = m->m_pkthdr.ph_rtableid;
	m_resethdr(m);
	m->m_pkthdr.ph_rtableid = rtableid;

	/*
	 * If the incoming packet was addressed directly to us,
	 * use dst as the src for the reply.  For broadcast, use
	 * the address which corresponds to the incoming interface.
	 */
	if (ia == NULL) {
		memset(&sin, 0, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = ip->ip_dst;

		rt = rtalloc(sintosa(&sin), 0, rtableid);
		if (rtisvalid(rt) &&
		    ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST))
			ia = ifatoia(rt->rt_ifa);
	}

	/*
	 * The following happens if the packet was not addressed to us.
	 * Use the new source address and do a route lookup. If it fails
	 * drop the packet as there is no path to the host.
	 */
	if (ia == NULL) {
		/* Drop the reference from the first lookup, if any. */
		rtfree(rt);

		memset(&sin, 0, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = ip->ip_src;

		/* keep packet in the original virtual instance */
		rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid);
		if (rt == NULL) {
			ipstat_inc(ips_noroute);
			m_freem(m);
			return (EHOSTUNREACH);
		}

		ia = ifatoia(rt->rt_ifa);
	}

	ip->ip_dst = ip->ip_src;
	ip->ip_ttl = MAXTTL;

	/* It is safe to dereference ``ia'' iff ``rt'' is valid. */
	ip->ip_src = ia->ia_addr.sin_addr;
	rtfree(rt);

	if (optlen > 0) {
		u_char *cp;
		int opt, cnt;
		u_int len;

		/*
		 * Retrieve any source routing from the incoming packet;
		 * add on any record-route or timestamp options.
		 */
		cp = (u_char *) (ip + 1);
		/*
		 * Without a recorded source route, start a fresh option
		 * mbuf holding a single zeroed in_addr.
		 */
		if (op && (opts = ip_srcroute(m)) == NULL &&
		    (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) {
			opts->m_len = sizeof(struct in_addr);
			mtod(opts, struct in_addr *)->s_addr = 0;
		}
		if (op && opts) {
#ifdef ICMPPRINTFS
			if (icmpprintfs)
				printf("icmp_reflect optlen %d rt %d => ",
				    optlen, opts->m_len);
#endif
			/* Walk the options; stop at EOL or a bad length. */
			for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
				opt = cp[IPOPT_OPTVAL];
				if (opt == IPOPT_EOL)
					break;
				if (opt == IPOPT_NOP)
					len = 1;
				else {
					if (cnt < IPOPT_OLEN + sizeof(*cp))
						break;
					len = cp[IPOPT_OLEN];
					if (len < IPOPT_OLEN + sizeof(*cp) ||
					    len > cnt)
						break;
				}
				/*
				 * Should check for overflow, but it
				 * "can't happen"
				 */
				if (opt == IPOPT_RR || opt == IPOPT_TS ||
				    opt == IPOPT_SECURITY) {
					memcpy(mtod(opts, caddr_t) +
					    opts->m_len, cp, len);
					opts->m_len += len;
				}
			}
			/* Terminate & pad, if necessary */
			if ((cnt = opts->m_len % 4) != 0)
				for (; cnt < 4; cnt++) {
					*(mtod(opts, caddr_t) + opts->m_len) =
					    IPOPT_EOL;
					opts->m_len++;
				}
#ifdef ICMPPRINTFS
			if (icmpprintfs)
				printf("%d\n", opts->m_len);
#endif
		}
		/* The reply itself goes out without the original options. */
		ip_stripoptions(m);
	}
	m->m_flags &= ~(M_BCAST|M_MCAST);
	if (op)
		*op = opts;

	return (0);
}
817 
818 /*
819  * Send an icmp packet back to the ip level
820  */
821 void
822 icmp_send(struct mbuf *m, struct mbuf *opts)
823 {
824 	struct ip *ip = mtod(m, struct ip *);
825 	int hlen;
826 	struct icmp *icp;
827 
828 	hlen = ip->ip_hl << 2;
829 	icp = (struct icmp *)(mtod(m, caddr_t) + hlen);
830 	icp->icmp_cksum = 0;
831 	m->m_pkthdr.csum_flags = M_ICMP_CSUM_OUT;
832 #ifdef ICMPPRINTFS
833 	if (icmpprintfs) {
834 		char dst[INET_ADDRSTRLEN], src[INET_ADDRSTRLEN];
835 
836 		inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst));
837 		inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src));
838 
839 		printf("icmp_send dst %s src %s\n", dst, src);
840 	}
841 #endif
842 	if (opts != NULL)
843 		m = ip_insertoptions(m, opts, &hlen);
844 
845 	ip_send(m);
846 }
847 
848 u_int32_t
849 iptime(void)
850 {
851 	struct timeval atv;
852 	u_long t;
853 
854 	microtime(&atv);
855 	t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000;
856 	return (htonl(t));
857 }
858 
859 int
860 icmp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
861     size_t newlen)
862 {
863 	int error;
864 
865 	/* All sysctl names at this level are terminal. */
866 	if (namelen != 1)
867 		return (ENOTDIR);
868 
869 	switch (name[0]) {
870 	case ICMPCTL_REDIRTIMEOUT:
871 
872 		NET_LOCK();
873 		error = sysctl_int(oldp, oldlenp, newp, newlen,
874 		    &icmp_redirtimeout);
875 		if (icmp_redirect_timeout_q != NULL) {
876 			if (icmp_redirtimeout == 0) {
877 				rt_timer_queue_destroy(icmp_redirect_timeout_q);
878 				icmp_redirect_timeout_q = NULL;
879 			} else
880 				rt_timer_queue_change(icmp_redirect_timeout_q,
881 				    icmp_redirtimeout);
882 		} else if (icmp_redirtimeout > 0) {
883 			icmp_redirect_timeout_q =
884 			    rt_timer_queue_create(icmp_redirtimeout);
885 		}
886 		NET_UNLOCK();
887 		break;
888 
889 	case ICMPCTL_STATS:
890 		error = icmp_sysctl_icmpstat(oldp, oldlenp, newp);
891 		break;
892 
893 	default:
894 		if (name[0] < ICMPCTL_MAXID) {
895 			NET_LOCK();
896 			error = sysctl_int_arr(icmpctl_vars, name, namelen,
897 			    oldp, oldlenp, newp, newlen);
898 			NET_UNLOCK();
899 			break;
900 		}
901 		error = ENOPROTOOPT;
902 		break;
903 	}
904 
905 	return (error);
906 }
907 
908 int
909 icmp_sysctl_icmpstat(void *oldp, size_t *oldlenp, void *newp)
910 {
911 	uint64_t counters[icps_ncounters];
912 	struct icmpstat icmpstat;
913 	u_long *words = (u_long *)&icmpstat;
914 	int i;
915 
916 	CTASSERT(sizeof(icmpstat) == (nitems(counters) * sizeof(u_long)));
917 	memset(&icmpstat, 0, sizeof icmpstat);
918 	counters_read(icmpcounters, counters, nitems(counters));
919 
920 	for (i = 0; i < nitems(counters); i++)
921 		words[i] = (u_long)counters[i];
922 
923 	return (sysctl_rdstruct(oldp, oldlenp, newp,
924 	    &icmpstat, sizeof(icmpstat)));
925 }
926 
927 struct rtentry *
928 icmp_mtudisc_clone(struct in_addr dst, u_int rtableid)
929 {
930 	struct sockaddr_in sin;
931 	struct rtentry *rt;
932 	int error;
933 
934 	memset(&sin, 0, sizeof(sin));
935 	sin.sin_family = AF_INET;
936 	sin.sin_len = sizeof(sin);
937 	sin.sin_addr = dst;
938 
939 	rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid);
940 
941 	/* Check if the route is actually usable */
942 	if (!rtisvalid(rt) || (rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)))
943 		goto bad;
944 
945 	/*
946 	 * No PMTU for local routes and permanent neighbors,
947 	 * ARP and NDP use the same expire timer as the route.
948 	 */
949 	if (ISSET(rt->rt_flags, RTF_LOCAL) ||
950 	    (ISSET(rt->rt_flags, RTF_LLINFO) && rt->rt_expire == 0))
951 		goto bad;
952 
953 	/* If we didn't get a host route, allocate one */
954 	if ((rt->rt_flags & RTF_HOST) == 0) {
955 		struct rtentry *nrt;
956 		struct rt_addrinfo info;
957 		struct sockaddr_rtlabel sa_rl;
958 
959 		memset(&info, 0, sizeof(info));
960 		info.rti_ifa = rt->rt_ifa;
961 		info.rti_flags = RTF_GATEWAY | RTF_HOST | RTF_DYNAMIC;
962 		info.rti_info[RTAX_DST] = sintosa(&sin);
963 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
964 		info.rti_info[RTAX_LABEL] =
965 		    rtlabel_id2sa(rt->rt_labelid, &sa_rl);
966 
967 		error = rtrequest(RTM_ADD, &info, rt->rt_priority, &nrt,
968 		    rtableid);
969 		if (error)
970 			goto bad;
971 		nrt->rt_rmx = rt->rt_rmx;
972 		rtfree(rt);
973 		rt = nrt;
974 		rtm_send(rt, RTM_ADD, 0, rtableid);
975 	}
976 	error = rt_timer_add(rt, icmp_mtudisc_timeout, ip_mtudisc_timeout_q,
977 	    rtableid);
978 	if (error)
979 		goto bad;
980 
981 	return (rt);
982 bad:
983 	rtfree(rt);
984 	return (NULL);
985 }
986 
987 /* Table of common MTUs: */
988 static const u_short mtu_table[] = {
989 	65535, 65280, 32000, 17914, 9180, 8166,
990 	4352, 2002, 1492, 1006, 508, 296, 68, 0
991 };
992 
993 void
994 icmp_mtudisc(struct icmp *icp, u_int rtableid)
995 {
996 	struct rtentry *rt;
997 	struct ifnet *ifp;
998 	u_long mtu = ntohs(icp->icmp_nextmtu);  /* Why a long?  IPv6 */
999 
1000 	rt = icmp_mtudisc_clone(icp->icmp_ip.ip_dst, rtableid);
1001 	if (rt == NULL)
1002 		return;
1003 
1004 	ifp = if_get(rt->rt_ifidx);
1005 	if (ifp == NULL) {
1006 		rtfree(rt);
1007 		return;
1008 	}
1009 
1010 	if (mtu == 0) {
1011 		int i = 0;
1012 
1013 		mtu = ntohs(icp->icmp_ip.ip_len);
1014 		/* Some 4.2BSD-based routers incorrectly adjust the ip_len */
1015 		if (mtu > rt->rt_mtu && rt->rt_mtu != 0)
1016 			mtu -= (icp->icmp_ip.ip_hl << 2);
1017 
1018 		/* If we still can't guess a value, try the route */
1019 		if (mtu == 0) {
1020 			mtu = rt->rt_mtu;
1021 
1022 			/* If no route mtu, default to the interface mtu */
1023 
1024 			if (mtu == 0)
1025 				mtu = ifp->if_mtu;
1026 		}
1027 
1028 		for (i = 0; i < nitems(mtu_table); i++)
1029 			if (mtu > mtu_table[i]) {
1030 				mtu = mtu_table[i];
1031 				break;
1032 			}
1033 	}
1034 
1035 	/*
1036 	 * XXX:   RTV_MTU is overloaded, since the admin can set it
1037 	 *	  to turn off PMTU for a route, and the kernel can
1038 	 *	  set it to indicate a serious problem with PMTU
1039 	 *	  on a route.  We should be using a separate flag
1040 	 *	  for the kernel to indicate this.
1041 	 */
1042 	if ((rt->rt_locks & RTV_MTU) == 0) {
1043 		if (mtu < 296 || mtu > ifp->if_mtu)
1044 			rt->rt_locks |= RTV_MTU;
1045 		else if (rt->rt_mtu > mtu || rt->rt_mtu == 0)
1046 			rt->rt_mtu = mtu;
1047 	}
1048 
1049 	if_put(ifp);
1050 	rtfree(rt);
1051 }
1052 
1053 void
1054 icmp_mtudisc_timeout(struct rtentry *rt, struct rttimer *r)
1055 {
1056 	struct ifnet *ifp;
1057 
1058 	NET_ASSERT_LOCKED();
1059 
1060 	ifp = if_get(rt->rt_ifidx);
1061 	if (ifp == NULL)
1062 		return;
1063 
1064 	if ((rt->rt_flags & (RTF_DYNAMIC|RTF_HOST)) == (RTF_DYNAMIC|RTF_HOST)) {
1065 		void (*ctlfunc)(int, struct sockaddr *, u_int, void *);
1066 		struct sockaddr_in sin;
1067 
1068 		sin = *satosin(rt_key(rt));
1069 
1070 		rtdeletemsg(rt, ifp, r->rtt_tableid);
1071 
1072 		/* Notify TCP layer of increased Path MTU estimate */
1073 		ctlfunc = inetsw[ip_protox[IPPROTO_TCP]].pr_ctlinput;
1074 		if (ctlfunc)
1075 			(*ctlfunc)(PRC_MTUINC, sintosa(&sin),
1076 			    r->rtt_tableid, NULL);
1077 	} else {
1078 		if ((rt->rt_locks & RTV_MTU) == 0)
1079 			rt->rt_mtu = 0;
1080 	}
1081 
1082 	if_put(ifp);
1083 }
1084 
1085 /*
1086  * Perform rate limit check.
1087  * Returns 0 if it is okay to send the icmp packet.
1088  * Returns 1 if the router SHOULD NOT send this icmp packet due to rate
1089  * limitation.
1090  *
1091  * XXX per-destination/type check necessary?
1092  */
1093 int
1094 icmp_ratelimit(const struct in_addr *dst, const int type, const int code)
1095 {
1096 	/* PPS limit */
1097 	if (!ppsratecheck(&icmperrppslim_last, &icmperrpps_count,
1098 	    icmperrppslim))
1099 		return 1;	/* The packet is subject to rate limit */
1100 	return 0;	/* okay to send */
1101 }
1102 
1103 void
1104 icmp_redirect_timeout(struct rtentry *rt, struct rttimer *r)
1105 {
1106 	struct ifnet *ifp;
1107 
1108 	NET_ASSERT_LOCKED();
1109 
1110 	ifp = if_get(rt->rt_ifidx);
1111 	if (ifp == NULL)
1112 		return;
1113 
1114 	if ((rt->rt_flags & (RTF_DYNAMIC|RTF_HOST)) == (RTF_DYNAMIC|RTF_HOST)) {
1115 		rtdeletemsg(rt, ifp, r->rtt_tableid);
1116 	}
1117 
1118 	if_put(ifp);
1119 }
1120 
1121 int
1122 icmp_do_exthdr(struct mbuf *m, u_int16_t class, u_int8_t ctype, void *buf,
1123     size_t len)
1124 {
1125 	struct ip *ip = mtod(m, struct ip *);
1126 	int hlen, off;
1127 	struct mbuf *n;
1128 	struct icmp *icp;
1129 	struct icmp_ext_hdr *ieh;
1130 	struct {
1131 		struct icmp_ext_hdr	ieh;
1132 		struct icmp_ext_obj_hdr	ieo;
1133 	} hdr;
1134 
1135 	hlen = ip->ip_hl << 2;
1136 	icp = (struct icmp *)(mtod(m, caddr_t) + hlen);
1137 	if (icp->icmp_type != ICMP_TIMXCEED && icp->icmp_type != ICMP_UNREACH &&
1138 	    icp->icmp_type != ICMP_PARAMPROB)
1139 		/* exthdr not supported */
1140 		return (0);
1141 
1142 	if (icp->icmp_length != 0)
1143 		/* exthdr already present, giving up */
1144 		return (0);
1145 
1146 	/* the actual offset starts after the common ICMP header */
1147 	hlen += ICMP_MINLEN;
1148 	/* exthdr must start on a word boundary */
1149 	off = roundup(ntohs(ip->ip_len) - hlen, sizeof(u_int32_t));
1150 	/* ... and at an offset of ICMP_EXT_OFFSET or bigger */
1151 	off = max(off, ICMP_EXT_OFFSET);
1152 	icp->icmp_length = off / sizeof(u_int32_t);
1153 
1154 	memset(&hdr, 0, sizeof(hdr));
1155 	hdr.ieh.ieh_version = ICMP_EXT_HDR_VERSION;
1156 	hdr.ieo.ieo_length = htons(sizeof(struct icmp_ext_obj_hdr) + len);
1157 	hdr.ieo.ieo_cnum = class;
1158 	hdr.ieo.ieo_ctype = ctype;
1159 
1160 	if (m_copyback(m, hlen + off, sizeof(hdr), &hdr, M_NOWAIT) ||
1161 	    m_copyback(m, hlen + off + sizeof(hdr), len, buf, M_NOWAIT)) {
1162 		m_freem(m);
1163 		return (ENOBUFS);
1164 	}
1165 
1166 	/* calculate checksum */
1167 	n = m_getptr(m, hlen + off, &off);
1168 	if (n == NULL)
1169 		panic("icmp_do_exthdr: m_getptr failure");
1170 	ieh = (struct icmp_ext_hdr *)(mtod(n, caddr_t) + off);
1171 	ieh->ieh_cksum = in4_cksum(n, 0, off, sizeof(hdr) + len);
1172 
1173 	ip->ip_len = htons(m->m_pkthdr.len);
1174 
1175 	return (0);
1176 }
1177