xref: /openbsd/sys/netinet/ip_input.c (revision d485f761)
1 /*	$OpenBSD: ip_input.c,v 1.93 2001/09/18 15:24:32 aaron Exp $	*/
2 /*	$NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
37  */
38 
39 #include "pf.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/mbuf.h>
44 #include <sys/domain.h>
45 #include <sys/protosw.h>
46 #include <sys/socket.h>
47 #include <sys/syslog.h>
48 #include <sys/sysctl.h>
49 
50 #include <net/if.h>
51 #include <net/if_dl.h>
52 #include <net/route.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/if_ether.h>
57 #include <netinet/ip.h>
58 #include <netinet/in_pcb.h>
59 #include <netinet/in_var.h>
60 #include <netinet/ip_var.h>
61 #include <netinet/ip_icmp.h>
62 
63 #if NPF > 0
64 #include <net/pfvar.h>
65 #endif
66 
67 #ifdef IPSEC
68 #include <netinet/ip_ipsp.h>
69 #endif /* IPSEC */
70 
71 #ifndef	IPFORWARDING
72 #ifdef GATEWAY
73 #define	IPFORWARDING	1	/* forward IP packets not for us */
74 #else /* GATEWAY */
75 #define	IPFORWARDING	0	/* don't forward IP packets not for us */
76 #endif /* GATEWAY */
77 #endif /* IPFORWARDING */
78 #ifndef	IPSENDREDIRECTS
79 #define	IPSENDREDIRECTS	1
80 #endif
81 
82 #ifndef IPMTUDISC
83 #define IPMTUDISC	1
84 #endif
85 #ifndef IPMTUDISCTIMEOUT
86 #define IPMTUDISCTIMEOUT (10 * 60)	/* as per RFC 1191 */
87 #endif
88 
/*
 * IPsec tunables.  Apart from encdebug, the integer knobs start out at
 * their IPSEC_DEFAULT_* compile-time values.  The three ipsec_def_*
 * name buffers have no initializer here; ip_init() copies the default
 * algorithm names into them.
 */
89 int encdebug = 0;
90 int ipsec_keep_invalid = IPSEC_DEFAULT_EMBRYONIC_SA_TIMEOUT;
91 int ipsec_require_pfs = IPSEC_DEFAULT_PFS;
92 int ipsec_soft_allocations = IPSEC_DEFAULT_SOFT_ALLOCATIONS;
93 int ipsec_exp_allocations = IPSEC_DEFAULT_EXP_ALLOCATIONS;
94 int ipsec_soft_bytes = IPSEC_DEFAULT_SOFT_BYTES;
95 int ipsec_exp_bytes = IPSEC_DEFAULT_EXP_BYTES;
96 int ipsec_soft_timeout = IPSEC_DEFAULT_SOFT_TIMEOUT;
97 int ipsec_exp_timeout = IPSEC_DEFAULT_EXP_TIMEOUT;
98 int ipsec_soft_first_use = IPSEC_DEFAULT_SOFT_FIRST_USE;
99 int ipsec_exp_first_use = IPSEC_DEFAULT_EXP_FIRST_USE;
100 int ipsec_expire_acquire = IPSEC_DEFAULT_EXPIRE_ACQUIRE;
101 char ipsec_def_enc[20];
102 char ipsec_def_auth[20];
103 char ipsec_def_comp[20];
104 
105 /*
106  * Note: DIRECTED_BROADCAST is handled this way so that previous
107  * configuration using this option will Just Work.
108  */
109 #ifndef IPDIRECTEDBCAST
110 #ifdef DIRECTED_BROADCAST
111 #define IPDIRECTEDBCAST	1
112 #else
113 #define	IPDIRECTEDBCAST	0
114 #endif /* DIRECTED_BROADCAST */
115 #endif /* IPDIRECTEDBCAST */
/*
 * Run-time IP knobs, seeded from compile-time defaults.  ipforwarding
 * decides in ipv4_input() whether packets not addressed to this host
 * are passed to ip_forward() or dropped; ip_directedbcast changes which
 * broadcast destinations in_iawithaddr() accepts.
 */
116 int	ipforwarding = IPFORWARDING;
117 int	ipsendredirects = IPSENDREDIRECTS;
118 int	ip_dosourceroute = 0;	/* no src-routing unless sysctl'd to enable */
119 int	ip_defttl = IPDEFTTL;
120 int	ip_mtudisc = IPMTUDISC;
121 u_int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
122 int	ip_directedbcast = IPDIRECTEDBCAST;
123 #ifdef DIAGNOSTIC
124 int	ipprintfs = 0;
125 #endif
126 
/*
 * Timer queue used with ip_mtudisc_timeout; stays NULL unless ip_init()
 * finds ip_mtudisc nonzero and creates it.
 */
127 struct rttimer_queue *ip_mtudisc_timeout_q = NULL;
128 
/* Default IPsec levels, taken from the IPSEC_*_LEVEL_DEFAULT constants. */
129 int	ipsec_auth_default_level = IPSEC_AUTH_LEVEL_DEFAULT;
130 int	ipsec_esp_trans_default_level = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
131 int	ipsec_esp_network_default_level = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
132 int	ipsec_ipcomp_default_level = IPSEC_IPCOMP_LEVEL_DEFAULT;
133 
134 /* Keep track of memory used for reassembly */
/*
 * ip_frags counts the ipqent entries currently allocated for fragments.
 * ipv4_input() calls ip_flush() and drops the incoming fragment when
 * ip_frags + 1 would exceed ip_maxqueue.
 */
135 int	ip_maxqueue = 300;
136 int	ip_frags = 0;
137 
138 /* from in_pcb.c */
139 extern int ipport_firstauto;
140 extern int ipport_lastauto;
141 extern int ipport_hifirstauto;
142 extern int ipport_hilastauto;
143 extern struct baddynamicports baddynamicports;
144 
145 extern	struct domain inetdomain;
146 extern	struct protosw inetsw[];
/* Maps an IP protocol number to an index into inetsw[]; built by ip_init(). */
147 u_char	ip_protox[IPPROTO_MAX];
/* ip_init() copies this into ipintrq.ifq_maxlen. */
148 int	ipqmaxlen = IFQ_MAXLEN;
/* List of interface addresses; walked by in_iawithaddr(). */
149 struct	in_ifaddrhead in_ifaddr;
/* Queue of received packets; emptied by ipintr(). */
150 struct	ifqueue ipintrq;
151 
/* Nonzero while the fragment-queue lock is held; see ipq_lock_try(). */
152 int	ipq_locked;
153 static __inline int ipq_lock_try __P((void));
154 static __inline void ipq_unlock __P((void));
155 
/* Pool of struct ipqent fragment entries; initialised by ip_init(). */
156 struct pool ipqent_pool;
157 
158 static __inline int
159 ipq_lock_try()
160 {
161 	int s;
162 
163 	s = splimp();
164 	if (ipq_locked) {
165 		splx(s);
166 		return (0);
167 	}
168 	ipq_locked = 1;
169 	splx(s);
170 	return (1);
171 }
172 
/*
 * ipq_lock() is just ipq_lock_try().  The callers in this file use it
 * as a plain statement, so a failed attempt (return value 0) is not
 * noticed and the caller continues as if it held the lock.
 */
173 #define ipq_lock() ipq_lock_try()
174 
175 static __inline void
176 ipq_unlock()
177 {
178 	int s;
179 
180 	s = splimp();
181 	ipq_locked = 0;
182 	splx(s);
183 }
184 
/*
 * Format an IPv4 address as dotted-decimal text.
 *
 * The four bytes of ina are printed in the order they sit in memory.
 * The result is stored in one static buffer that the next call
 * overwrites, so a caller that needs two addresses at the same time
 * must copy the first string out before calling again.
 *
 * Returns a pointer to that static, NUL-terminated buffer.
 */
char *
inet_ntoa(ina)
	struct in_addr ina;
{
	static char buf[4*sizeof "123"];
	const unsigned char *ucp = (const unsigned char *)&ina;

	/*
	 * Bounded formatting: the output can never run past buf.  The
	 * bytes are unsigned char, so no masking is needed.
	 */
	snprintf(buf, sizeof buf, "%d.%d.%d.%d", ucp[0], ucp[1],
	    ucp[2], ucp[3]);
	return (buf);
}
196 
197 /*
198  * We need to save the IP options in case a protocol wants to respond
199  * to an incoming packet over the same route if the packet got here
200  * using IP source routing.  This allows connection establishment and
201  * maintenance when the remote end is on a network that is not known
202  * to us.
203  */
/*
 * ipv4_input() resets ip_nhops to 0 before option processing of every
 * packet.  Presumably it counts the route[] entries in use -- confirm
 * against save_rte().
 */
204 int	ip_nhops = 0;
205 static	struct ip_srcrt {
206 	struct	in_addr dst;			/* final destination */
207 	char	nop;				/* one NOP to align */
208 	char	srcopt[IPOPT_OFFSET + 1];	/* OPTVAL, OLEN and OFFSET */
209 	struct	in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
210 } ip_srcrt;
211 
/* save_rte() is called from ip_dooptions() at the end of a source route. */
212 static void save_rte __P((u_char *, struct in_addr));
213 static int ip_weadvertise(u_int32_t);
214 
215 /*
216  * IP initialization: fill in IP protocol switch table.
217  * All protocols not implemented in kernel go to raw IP protocol handler.
218  */
219 void
220 ip_init()
221 {
222 	register struct protosw *pr;
223 	register int i;
224 	const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP;
225 	const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP;
226 
227 	pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
228 	    0, NULL, NULL, M_IPQ);
229 
230 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
231 	if (pr == 0)
232 		panic("ip_init");
233 	for (i = 0; i < IPPROTO_MAX; i++)
234 		ip_protox[i] = pr - inetsw;
235 	for (pr = inetdomain.dom_protosw;
236 	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
237 		if (pr->pr_domain->dom_family == PF_INET &&
238 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
239 			ip_protox[pr->pr_protocol] = pr - inetsw;
240 	LIST_INIT(&ipq);
241 	ipintrq.ifq_maxlen = ipqmaxlen;
242 	TAILQ_INIT(&in_ifaddr);
243 	if (ip_mtudisc != 0)
244 		ip_mtudisc_timeout_q =
245 		    rt_timer_queue_create(ip_mtudisc_timeout);
246 
247 	/* Fill in list of ports not to allocate dynamically. */
248 	bzero((void *)&baddynamicports, sizeof(baddynamicports));
249 	for (i = 0; defbaddynamicports_tcp[i] != 0; i++)
250 		DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]);
251 	for (i = 0; defbaddynamicports_udp[i] != 0; i++)
252 		DP_SET(baddynamicports.udp, defbaddynamicports_tcp[i]);
253 
254 	strncpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc));
255 	strncpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth));
256 	strncpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
257 }
258 
/*
 * Scratch socket address: the first two members are preset to
 * sizeof(ipaddr) and AF_INET (presumably sin_len and sin_family), and
 * ip_dooptions() fills in sin_addr before each interface lookup.
 */
259 struct	sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
/* NOTE(review): not referenced in the visible part of this file; presumably used by ip_forward(). */
260 struct	route ipforward_rt;
261 
262 void
263 ipintr()
264 {
265 	register struct mbuf *m;
266 	int s;
267 
268 	if (needqueuedrain)
269 		m_reclaim();
270 
271 	while (1) {
272 		/*
273 		 * Get next datagram off input queue and get IP header
274 		 * in first mbuf.
275 		 */
276 		s = splimp();
277 		IF_DEQUEUE(&ipintrq, m);
278 		splx(s);
279 		if (m == 0)
280 			return;
281 #ifdef	DIAGNOSTIC
282 		if ((m->m_flags & M_PKTHDR) == 0)
283 			panic("ipintr no HDR");
284 #endif
285 		ipv4_input(m);
286 	}
287 }
288 
289 /*
290  * Ip input routine.  Checksum and byte swap header.  If fragmented
291  * try to reassemble.  Process options.  Pass to next level.
 *
 * m is the received packet.  The function does not return it: on each
 * path the packet is either released with m_freem(), handed to
 * ip_forward(), ip_dooptions(), ip_reass() or the protocol's pr_input
 * routine, or the function returns after a callee (m_pullup, the
 * traffic conditioner) has reported that it is gone.
 *
 * Order of work: header sanity checks, checksum, length trimming,
 * optional packet filter and traffic conditioner, option processing,
 * "is it for us" decision, forwarding or (at "ours") reassembly,
 * optional IPsec policy check, protocol dispatch.
292  */
293 void
294 ipv4_input(m)
295 	struct mbuf *m;
296 {
297 	register struct ip *ip;
298 	register struct ipq *fp;
299 	struct in_ifaddr *ia;
300 	struct ipqent *ipqe;
301 	int hlen, mff;
302 #ifdef IPSEC
303 	int error, s;
304 	struct tdb *tdb;
305 	struct tdb_ident *tdbi;
306 	struct m_tag *mtag;
307 #endif /* IPSEC */
308 
309 	/*
310 	 * If no IP addresses have been set yet but the interfaces
311 	 * are receiving, can't do anything with incoming packets yet.
312 	 */
313 	if (in_ifaddr.tqh_first == 0)
314 		goto bad;
315 	ipstat.ips_total++;
	/* Make sure the fixed part of the header is in the first mbuf. */
316 	if (m->m_len < sizeof (struct ip) &&
317 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
318 		ipstat.ips_toosmall++;
319 		return;
320 	}
321 	ip = mtod(m, struct ip *);
322 	if (ip->ip_v != IPVERSION) {
323 		ipstat.ips_badvers++;
324 		goto bad;
325 	}
326 	hlen = ip->ip_hl << 2;
327 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
328 		ipstat.ips_badhlen++;
329 		goto bad;
330 	}
	/* Options present: pull the whole header into the first mbuf. */
331 	if (hlen > m->m_len) {
332 		if ((m = m_pullup(m, hlen)) == NULL) {
333 			ipstat.ips_badhlen++;
334 			return;
335 		}
336 		ip = mtod(m, struct ip *);
337 	}
338 
339 	/* 127/8 must not appear on wire - RFC1122 */
340 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
341 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
342 		if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) {
343 			ipstat.ips_badaddr++;
344 			goto bad;
345 		}
346 	}
347 
	/*
	 * Header checksum: in_cksum() is only run when neither the
	 * M_IPV4_CSUM_IN_OK nor the M_IPV4_CSUM_IN_BAD flag is set in
	 * the packet header; ips_inhwcsum counts the flagged cases.
	 */
348 	if ((m->m_pkthdr.csum & M_IPV4_CSUM_IN_OK) == 0) {
349 		if (m->m_pkthdr.csum & M_IPV4_CSUM_IN_BAD) {
350 			ipstat.ips_inhwcsum++;
351 			ipstat.ips_badsum++;
352 			goto bad;
353 		}
354 
355 		if (in_cksum(m, hlen) != 0) {
356 			ipstat.ips_badsum++;
357 			goto bad;
358 		}
359 	} else {
360 		m->m_pkthdr.csum &= ~M_IPV4_CSUM_IN_OK;
361 		ipstat.ips_inhwcsum++;
362 	}
363 
364 	/*
365 	 * Convert fields to host representation.
366 	 */
367 	NTOHS(ip->ip_len);
368 	if (ip->ip_len < hlen) {
369 		ipstat.ips_badlen++;
370 		goto bad;
371 	}
372 	NTOHS(ip->ip_off);
373 
374 	/*
375 	 * Check that the amount of data in the buffers
376 	 * is at least as much as the IP header would have us expect.
377 	 * Trim mbufs if longer than we expect.
378 	 * Drop packet if shorter than we expect.
379 	 */
380 	if (m->m_pkthdr.len < ip->ip_len) {
381 		ipstat.ips_tooshort++;
382 		goto bad;
383 	}
384 	if (m->m_pkthdr.len > ip->ip_len) {
385 		if (m->m_len == m->m_pkthdr.len) {
386 			m->m_len = ip->ip_len;
387 			m->m_pkthdr.len = ip->ip_len;
388 		} else
			/* The count passed here is negative in this branch. */
389 			m_adj(m, ip->ip_len - m->m_pkthdr.len);
390 	}
391 
392 #if NPF > 0
393 	/*
394 	 * Packet filter
395 	 */
396 	if (pf_test(PF_IN, m->m_pkthdr.rcvif, &m) != PF_PASS)
397 		goto bad;
398 
	/* pf_test() was given &m, so re-read the header from m. */
399 	ip = mtod(m, struct ip *);
400 	hlen = ip->ip_hl << 2;
401 #endif
402 
403 #ifdef ALTQ
404 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
405 		/* packet is dropped by traffic conditioner */
406 		return;
407 #endif
408 
409 	/*
410 	 * Process options and, if not destined for us,
411 	 * ship it on.  ip_dooptions returns 1 when an
412 	 * error was detected (causing an icmp message
413 	 * to be sent and the original packet to be freed).
414 	 */
415 	ip_nhops = 0;		/* for source routed packets */
416 	if (hlen > sizeof (struct ip) && ip_dooptions(m)) {
417 	        return;
418 	}
419 
420 	/*
421 	 * Check our list of addresses, to see if the packet is for us.
422 	 */
423 	if ((ia = in_iawithaddr(ip->ip_dst, m)) != NULL &&
424 	    (ia->ia_ifp->if_flags & IFF_UP))
425 		goto ours;
426 
427 	if (IN_MULTICAST(ip->ip_dst.s_addr)) {
428 		struct in_multi *inm;
429 #ifdef MROUTING
430 		extern struct socket *ip_mrouter;
431 
432 		if (m->m_flags & M_EXT) {
433 			if ((m = m_pullup(m, hlen)) == NULL) {
434 				ipstat.ips_toosmall++;
435 				return;
436 			}
437 			ip = mtod(m, struct ip *);
438 		}
439 
440 		if (ip_mrouter) {
441 			/*
442 			 * If we are acting as a multicast router, all
443 			 * incoming multicast packets are passed to the
444 			 * kernel-level multicast forwarding function.
445 			 * The packet is returned (relatively) intact; if
446 			 * ip_mforward() returns a non-zero value, the packet
447 			 * must be discarded, else it may be accepted below.
448 			 *
449 			 * (The IP ident field is put in the same byte order
450 			 * as expected when ip_mforward() is called from
451 			 * ip_output().)
452 			 */
453 			if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) {
454 				ipstat.ips_cantforward++;
455 				m_freem(m);
456 				return;
457 			}
458 
459 			/*
460 			 * The process-level routing demon needs to receive
461 			 * all multicast IGMP packets, whether or not this
462 			 * host belongs to their destination groups.
463 			 */
464 			if (ip->ip_p == IPPROTO_IGMP)
465 				goto ours;
466 			ipstat.ips_forward++;
467 		}
468 #endif
469 		/*
470 		 * See if we belong to the destination multicast group on the
471 		 * arrival interface.
472 		 */
473 		IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
474 		if (inm == NULL) {
475 			ipstat.ips_cantforward++;
476 			m_freem(m);
477 			return;
478 		}
479 		goto ours;
480 	}
481 	if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
482 	    ip->ip_dst.s_addr == INADDR_ANY)
483 		goto ours;
484 
485 	/*
486 	 * Not for us; forward if possible and desirable.
487 	 */
488 	if (ipforwarding == 0) {
489 		ipstat.ips_cantforward++;
490 		m_freem(m);
491 	} else {
492 #ifdef IPSEC
493 	        /* IPsec policy check for forwarded packets */
494 		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
495                 s = splnet();
496 		if (mtag != NULL) {
			/* The tdb_ident is stored directly after the tag header. */
497 			tdbi = (struct tdb_ident *)(mtag + 1);
498 			tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
499 		} else
500 			tdb = NULL;
501 	        ipsp_spd_lookup(m, AF_INET, hlen, &error,
502 		    IPSP_DIRECTION_IN, tdb, NULL);
503                 splx(s);
504 
505 		/* Error or otherwise drop-packet indication */
506 		if (error) {
507 			ipstat.ips_cantforward++;
508 			m_freem(m);
509 			return;
510 		}
511 
512 		/* Fall through, forward packet */
513 #endif /* IPSEC */
514 
515 		ip_forward(m, 0);
516 	}
517 	return;
518 
519 ours:
520 	/*
521 	 * If offset or IP_MF are set, must reassemble.
522 	 * Otherwise, nothing need be done.
523 	 * (We could look in the reassembly queue to see
524 	 * if the packet was previously fragmented,
525 	 * but it's not worth the time; just let them time out.)
526 	 */
527 	if (ip->ip_off &~ (IP_DF | IP_RF)) {
528 		if (m->m_flags & M_EXT) {		/* XXX */
529 			if ((m = m_pullup(m, hlen)) == NULL) {
530 				ipstat.ips_toosmall++;
531 				return;
532 			}
533 			ip = mtod(m, struct ip *);
534 		}
535 
536 		/*
537 		 * Look for queue of fragments
538 		 * of this datagram.
539 		 */
		/*
		 * NOTE(review): ipq_lock() is ipq_lock_try() and its result
		 * is not checked here, so the code below runs even when the
		 * lock was already held.
		 */
540 		ipq_lock();
541 		for (fp = ipq.lh_first; fp != NULL; fp = fp->ipq_q.le_next)
542 			if (ip->ip_id == fp->ipq_id &&
543 			    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
544 			    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
545 			    ip->ip_p == fp->ipq_p)
546 				goto found;
547 		fp = 0;
548 found:
549 
550 		/*
551 		 * Adjust ip_len to not reflect header,
552 		 * set ipqe_mff if more fragments are expected,
553 		 * convert offset of this to bytes.
554 		 */
555 		ip->ip_len -= hlen;
556 		mff = (ip->ip_off & IP_MF) != 0;
557 		if (mff) {
558 			/*
559 			 * Make sure that fragments have a data length
560 			 * that's a non-zero multiple of 8 bytes.
561 			 */
562 			if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
563 				ipstat.ips_badfrags++;
564 				ipq_unlock();
565 				goto bad;
566 			}
567 		}
568 		ip->ip_off <<= 3;
569 
570 		/*
571 		 * If datagram marked as having more fragments
572 		 * or if this is not the first fragment,
573 		 * attempt reassembly; if it succeeds, proceed.
574 		 */
575 		if (mff || ip->ip_off) {
576 			ipstat.ips_fragments++;
			/* Over the fragment limit: flush old queues and drop this one. */
577 			if (ip_frags + 1 > ip_maxqueue) {
578 				ip_flush();
579 				ipstat.ips_rcvmemdrop++;
580 				ipq_unlock();
581 				goto bad;
582 			}
583 
584 			ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
585 			if (ipqe == NULL) {
586 				ipstat.ips_rcvmemdrop++;
587 				ipq_unlock();
588 				goto bad;
589 			}
			/*
			 * Count the new entry; ip_reass() and ip_freef()
			 * decrement ip_frags whenever they release an entry.
			 */
590 			ip_frags++;
591 			ipqe->ipqe_mff = mff;
592 			ipqe->ipqe_m = m;
593 			ipqe->ipqe_ip = ip;
594 			m = ip_reass(ipqe, fp);
			/* 0 means the datagram is not complete yet (or was dropped). */
595 			if (m == 0) {
596 				ipq_unlock();
597 				return;
598 			}
599 			ipstat.ips_reassembled++;
600 			ip = mtod(m, struct ip *);
601 			hlen = ip->ip_hl << 2;
602 		} else
			/* Unfragmented packet matching an old queue: discard the queue. */
603 			if (fp)
604 				ip_freef(fp);
605 		ipq_unlock();
606 	} else
607 		ip->ip_len -= hlen;
608 
609 #ifdef IPSEC
610         /*
611          * If it's a protected packet for us, skip the policy check.
612          * That's because we really only care about the properties of
613          * the protected packet, and not the intermediate versions.
614          * While this is not the most paranoid setting, it allows
615          * some flexibility in handling of nested tunnels etc.
616          */
617         if ((ip->ip_p == IPPROTO_ESP) || (ip->ip_p == IPPROTO_AH) ||
618 	    (ip->ip_p == IPPROTO_IPCOMP))
619           goto skipipsec;
620 
621 	/*
622 	 * If the protected packet was tunneled, then we need to
623 	 * verify the protected packet's information, not the
624 	 * external headers. Thus, skip the policy lookup for the
625 	 * external packet, and keep the IPsec information linked on
626 	 * the packet header (the encapsulation routines know how
627 	 * to deal with that).
628 	 */
629 	if ((ip->ip_p == IPPROTO_IPIP) || (ip->ip_p == IPPROTO_IPV6))
630 	  goto skipipsec;
631 
632 	/*
633 	 * If the protected packet is TCP or UDP, we'll do the
634 	 * policy check in the respective input routine, so we can
635 	 * check for bypass sockets.
636 	 */
637 	if ((ip->ip_p == IPPROTO_TCP) || (ip->ip_p == IPPROTO_UDP))
638 	  goto skipipsec;
639 
640 	/* IPsec policy check for local-delivery packets */
641 	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
642         s = splnet();
643 	if (mtag) {
644 		tdbi = (struct tdb_ident *)(mtag + 1);
645 	        tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
646 	} else
647 		tdb = NULL;
648 	ipsp_spd_lookup(m, AF_INET, hlen, &error, IPSP_DIRECTION_IN,
649 	    tdb, NULL);
650         splx(s);
651 
652 	/* Error or otherwise drop-packet indication */
653 	if (error) {
654 	        ipstat.ips_cantforward++;
655 		m_freem(m);
656 		return;
657 	}
658 
659  skipipsec:
660 	/* Otherwise, just fall through and deliver the packet */
661 #endif /* IPSEC */
662 
663 	/*
664 	 * Switch out to protocol's input routine.
665 	 */
666 	ipstat.ips_delivered++;
	/* ip_protox[] maps the protocol number to its inetsw[] entry. */
667 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen, NULL, 0);
668 	return;
	/* Common drop path: release the packet. */
669 bad:
670 	m_freem(m);
671 }
672 
673 struct in_ifaddr *
674 in_iawithaddr(ina, m)
675 	struct in_addr ina;
676 	register struct mbuf *m;
677 {
678 	register struct in_ifaddr *ia;
679 
680 	for (ia = in_ifaddr.tqh_first; ia; ia = ia->ia_list.tqe_next) {
681 		if ((ina.s_addr == ia->ia_addr.sin_addr.s_addr) ||
682 		    ((ia->ia_ifp->if_flags & (IFF_LOOPBACK|IFF_LINK1)) ==
683 			(IFF_LOOPBACK|IFF_LINK1) &&
684 		     ia->ia_subnet == (ina.s_addr & ia->ia_subnetmask)))
685 			return ia;
686 		if (((ip_directedbcast == 0) || (m && ip_directedbcast &&
687 		    ia->ia_ifp == m->m_pkthdr.rcvif)) &&
688 		    (ia->ia_ifp->if_flags & IFF_BROADCAST)) {
689 			if (ina.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
690 			    ina.s_addr == ia->ia_netbroadcast.s_addr ||
691 			    /*
692 			     * Look for all-0's host part (old broadcast addr),
693 			     * either for subnet or net.
694 			     */
695 			    ina.s_addr == ia->ia_subnet ||
696 			    ina.s_addr == ia->ia_net) {
697 				/* Make sure M_BCAST is set */
698 				if (m)
699 					m->m_flags |= M_BCAST;
700 				return ia;
701 			    }
702 		}
703 	}
704 
705 	return NULL;
706 }
707 
708 /*
709  * Take incoming datagram fragment and try to
710  * reassemble it into whole datagram.  If a chain for
711  * reassembly of this datagram already exists, then it
712  * is given as fp; otherwise have to make a chain.
 *
 * ipqe describes the new fragment; the caller has filled in ipqe_m,
 * ipqe_ip and ipqe_mff and has already counted it in ip_frags.
 * ip_off and ip_len of every fragment are used here as byte offset
 * and data length without the header.
 *
 * Returns the mbuf chain of the complete datagram, or 0 when the
 * datagram is still incomplete, the fragment was dropped, or the
 * reassembled size was too large (in which case the queue is freed).
713  */
714 struct mbuf *
715 ip_reass(ipqe, fp)
716 	struct ipqent *ipqe;
717 	struct ipq *fp;
718 {
719 	struct mbuf *m = ipqe->ipqe_m;
720 	struct ipqent *nq, *p, *q;
721 	struct ip *ip;
722 	struct mbuf *t;
723 	int hlen = ipqe->ipqe_ip->ip_hl << 2;
724 	int i, next;
725 
726 	/*
727 	 * Presence of header sizes in mbufs
728 	 * would confuse code below.
729 	 */
	/* Hide the header; ipqe_ip still points at it for later use. */
730 	m->m_data += hlen;
731 	m->m_len -= hlen;
732 
733 	/*
734 	 * If first fragment to arrive, create a reassembly queue.
735 	 */
736 	if (fp == 0) {
737 		MALLOC(fp, struct ipq *, sizeof (struct ipq),
738 		    M_FTABLE, M_NOWAIT);
739 		if (fp == NULL)
740 			goto dropfrag;
741 		LIST_INSERT_HEAD(&ipq, fp, ipq_q);
742 		fp->ipq_ttl = IPFRAGTTL;
743 		fp->ipq_p = ipqe->ipqe_ip->ip_p;
744 		fp->ipq_id = ipqe->ipqe_ip->ip_id;
745 		LIST_INIT(&fp->ipq_fragq);
746 		fp->ipq_src = ipqe->ipqe_ip->ip_src;
747 		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
748 		p = NULL;
749 		goto insert;
750 	}
751 
752 	/*
753 	 * Find a segment which begins after this one does.
754 	 */
	/* Afterwards p is the entry before the insertion point (or NULL), q the one after. */
755 	for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
756 	    p = q, q = q->ipqe_q.le_next)
757 		if (q->ipqe_ip->ip_off > ipqe->ipqe_ip->ip_off)
758 			break;
759 
760 	/*
761 	 * If there is a preceding segment, it may provide some of
762 	 * our data already.  If so, drop the data from the incoming
763 	 * segment.  If it provides all of our data, drop us.
764 	 */
765 	if (p != NULL) {
		/* i = number of leading bytes already covered by p. */
766 		i = p->ipqe_ip->ip_off + p->ipqe_ip->ip_len -
767 		    ipqe->ipqe_ip->ip_off;
768 		if (i > 0) {
769 			if (i >= ipqe->ipqe_ip->ip_len)
770 				goto dropfrag;
771 			m_adj(ipqe->ipqe_m, i);
772 			ipqe->ipqe_ip->ip_off += i;
773 			ipqe->ipqe_ip->ip_len -= i;
774 		}
775 	}
776 
777 	/*
778 	 * While we overlap succeeding segments trim them or,
779 	 * if they are completely covered, dequeue them.
780 	 */
781 	for (; q != NULL && ipqe->ipqe_ip->ip_off + ipqe->ipqe_ip->ip_len >
782 	    q->ipqe_ip->ip_off; q = nq) {
783 		i = (ipqe->ipqe_ip->ip_off + ipqe->ipqe_ip->ip_len) -
784 		    q->ipqe_ip->ip_off;
		/* Partial overlap: trim the front of q and stop. */
785 		if (i < q->ipqe_ip->ip_len) {
786 			q->ipqe_ip->ip_len -= i;
787 			q->ipqe_ip->ip_off += i;
788 			m_adj(q->ipqe_m, i);
789 			break;
790 		}
		/* q is completely covered: remember its successor, then free it. */
791 		nq = q->ipqe_q.le_next;
792 		m_freem(q->ipqe_m);
793 		LIST_REMOVE(q, ipqe_q);
794 		pool_put(&ipqent_pool, q);
795 		ip_frags--;
796 	}
797 
798 insert:
799 	/*
800 	 * Stick new segment in its place;
801 	 * check for complete reassembly.
802 	 */
803 	if (p == NULL) {
804 		LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
805 	} else {
806 		LIST_INSERT_AFTER(p, ipqe, ipqe_q);
807 	}
	/*
	 * Walk the queue looking for a gap: each fragment must start
	 * exactly where the previous one ended.  After the loop p is the
	 * last entry (never NULL, since ipqe was just inserted).
	 */
808 	next = 0;
809 	for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
810 	    p = q, q = q->ipqe_q.le_next) {
811 		if (q->ipqe_ip->ip_off != next)
812 			return (0);
813 		next += q->ipqe_ip->ip_len;
814 	}
	/* The last fragment so far still announces more to come. */
815 	if (p->ipqe_mff)
816 		return (0);
817 
818 	/*
819 	 * Reassembly is complete.  Check for a bogus message size and
820 	 * concatenate fragments.
821 	 */
822 	q = fp->ipq_fragq.lh_first;
823 	ip = q->ipqe_ip;
824 	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
825 		ipstat.ips_toolong++;
826 		ip_freef(fp);
827 		return (0);
828 	}
829 	m = q->ipqe_m;
830 	t = m->m_next;
831 	m->m_next = 0;
832 	m_cat(m, t);
833 	nq = q->ipqe_q.le_next;
834 	pool_put(&ipqent_pool, q);
835 	ip_frags--;
	/* Append the remaining fragments; only the entries are released, not their mbufs. */
836 	for (q = nq; q != NULL; q = nq) {
837 		t = q->ipqe_m;
838 		nq = q->ipqe_q.le_next;
839 		pool_put(&ipqent_pool, q);
840 		ip_frags--;
841 		m_cat(m, t);
842 	}
843 
844 	/*
845 	 * Create header for new ip packet by
846 	 * modifying header of first packet;
847 	 * dequeue and discard fragment reassembly header.
848 	 * Make header visible.
849 	 */
850 	ip->ip_len = next;
851 	ip->ip_src = fp->ipq_src;
852 	ip->ip_dst = fp->ipq_dst;
853 	LIST_REMOVE(fp, ipq_q);
854 	FREE(fp, M_FTABLE);
855 	m->m_len += (ip->ip_hl << 2);
856 	m->m_data -= (ip->ip_hl << 2);
857 	/* some debugging cruft by sklower, below, will go away soon */
858 	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
		/* Recompute the packet length from the lengths of all mbufs. */
859 		int plen = 0;
860 		for (t = m; t; t = t->m_next)
861 			plen += t->m_len;
862 		m->m_pkthdr.len = plen;
863 	}
864 	return (m);
865 
	/* Drop the incoming fragment: free its mbufs and its entry, undo the caller's count. */
866 dropfrag:
867 	ipstat.ips_fragdropped++;
868 	m_freem(m);
869 	pool_put(&ipqent_pool, ipqe);
870 	ip_frags--;
871 	return (0);
872 }
873 
874 /*
875  * Free a fragment reassembly header and all
876  * associated datagrams.
877  */
878 void
879 ip_freef(fp)
880 	struct ipq *fp;
881 {
882 	register struct ipqent *q, *p;
883 
884 	for (q = fp->ipq_fragq.lh_first; q != NULL; q = p) {
885 		p = q->ipqe_q.le_next;
886 		m_freem(q->ipqe_m);
887 		LIST_REMOVE(q, ipqe_q);
888 		pool_put(&ipqent_pool, q);
889 		ip_frags--;
890 	}
891 	LIST_REMOVE(fp, ipq_q);
892 	FREE(fp, M_FTABLE);
893 }
894 
895 /*
896  * IP timer processing;
897  * if a timer expires on a reassembly
898  * queue, discard it.
899  */
900 void
901 ip_slowtimo()
902 {
903 	register struct ipq *fp, *nfp;
904 	int s = splsoftnet();
905 
906 	ipq_lock();
907 	for (fp = ipq.lh_first; fp != NULL; fp = nfp) {
908 		nfp = fp->ipq_q.le_next;
909 		if (--fp->ipq_ttl == 0) {
910 			ipstat.ips_fragtimeout++;
911 			ip_freef(fp);
912 		}
913 	}
914 	ipq_unlock();
915 	splx(s);
916 }
917 
918 /*
919  * Drain off all datagram fragments.
920  */
921 void
922 ip_drain()
923 {
924 
925 	if (ipq_lock_try() == 0)
926 		return;
927 	while (ipq.lh_first != NULL) {
928 		ipstat.ips_fragdropped++;
929 		ip_freef(ipq.lh_first);
930 	}
931 	ipq_unlock();
932 }
933 
934 /*
935  * Flush a bunch of datagram fragments, till we are down to 75%.
936  */
937 void
938 ip_flush()
939 {
940 	int max = 50;
941 
942 	/* ipq already locked */
943 	while (ipq.lh_first != NULL && ip_frags > ip_maxqueue * 3 / 4 && --max) {
944 		ipstat.ips_fragdropped++;
945 		ip_freef(ipq.lh_first);
946 	}
947 }
948 
949 /*
950  * Do option processing on a datagram,
951  * possibly discarding it if bad options are encountered,
952  * or forwarding it if source-routed.
953  * Returns 1 if packet has been forwarded/freed,
954  * 0 if the packet should be processed further.
955  */
956 int
957 ip_dooptions(m)
958 	struct mbuf *m;
959 {
960 	register struct ip *ip = mtod(m, struct ip *);
961 	register u_char *cp;
962 	struct ip_timestamp ipt;
963 	register struct in_ifaddr *ia;
964 	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
965 	struct in_addr sin, dst;
966 	n_time ntime;
967 
968 	dst = ip->ip_dst;
969 	cp = (u_char *)(ip + 1);
970 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
971 
972 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
973 		opt = cp[IPOPT_OPTVAL];
974 		if (opt == IPOPT_EOL)
975 			break;
976 		if (opt == IPOPT_NOP)
977 			optlen = 1;
978 		else {
979 			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
980 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
981 				goto bad;
982 			}
983 			optlen = cp[IPOPT_OLEN];
984 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
985 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
986 				goto bad;
987 			}
988 		}
989 
990 		switch (opt) {
991 
992 		default:
993 			break;
994 
995 		/*
996 		 * Source routing with record.
997 		 * Find interface with current destination address.
998 		 * If none on this machine then drop if strictly routed,
999 		 * or do nothing if loosely routed.
1000 		 * Record interface address and bring up next address
1001 		 * component.  If strictly routed make sure next
1002 		 * address is on directly accessible net.
1003 		 */
1004 		case IPOPT_LSRR:
1005 		case IPOPT_SSRR:
1006 			if (!ip_dosourceroute) {
1007 				char buf[4*sizeof "123"];
1008 
1009 				strcpy(buf, inet_ntoa(ip->ip_dst));
1010 				log(LOG_WARNING,
1011 				    "attempted source route from %s to %s\n",
1012 				    inet_ntoa(ip->ip_src), buf);
1013 				type = ICMP_UNREACH;
1014 				code = ICMP_UNREACH_SRCFAIL;
1015 				goto bad;
1016 			}
1017 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1018 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1019 				goto bad;
1020 			}
1021 			ipaddr.sin_addr = ip->ip_dst;
1022 			ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)));
1023 			if (ia == 0) {
1024 				if (opt == IPOPT_SSRR) {
1025 					type = ICMP_UNREACH;
1026 					code = ICMP_UNREACH_SRCFAIL;
1027 					goto bad;
1028 				}
1029 				/*
1030 				 * Loose routing, and not at next destination
1031 				 * yet; nothing to do except forward.
1032 				 */
1033 				break;
1034 			}
1035 			off--;			/* 0 origin */
1036 			if ((off + sizeof(struct in_addr)) > optlen) {
1037 				/*
1038 				 * End of source route.  Should be for us.
1039 				 */
1040 				save_rte(cp, ip->ip_src);
1041 				break;
1042 			}
1043 
1044 			/*
1045 			 * locate outgoing interface
1046 			 */
1047 			bcopy((caddr_t)(cp + off), (caddr_t)&ipaddr.sin_addr,
1048 			    sizeof(ipaddr.sin_addr));
1049 			if (opt == IPOPT_SSRR) {
1050 #define	INA	struct in_ifaddr *
1051 #define	SA	struct sockaddr *
1052 			    if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0)
1053 				ia = (INA)ifa_ifwithnet((SA)&ipaddr);
1054 			} else
1055 				ia = ip_rtaddr(ipaddr.sin_addr);
1056 			if (ia == 0) {
1057 				type = ICMP_UNREACH;
1058 				code = ICMP_UNREACH_SRCFAIL;
1059 				goto bad;
1060 			}
1061 			ip->ip_dst = ipaddr.sin_addr;
1062 			bcopy((caddr_t)&ia->ia_addr.sin_addr,
1063 			    (caddr_t)(cp + off), sizeof(struct in_addr));
1064 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1065 			/*
1066 			 * Let ip_intr's mcast routing check handle mcast pkts
1067 			 */
1068 			forward = !IN_MULTICAST(ip->ip_dst.s_addr);
1069 			break;
1070 
1071 		case IPOPT_RR:
1072 			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1073 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1074 				goto bad;
1075 			}
1076 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1077 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1078 				goto bad;
1079 			}
1080 
1081 			/*
1082 			 * If no space remains, ignore.
1083 			 */
1084 			off--;			/* 0 origin */
1085 			if ((off + sizeof(struct in_addr)) > optlen)
1086 				break;
1087 			bcopy((caddr_t)(&ip->ip_dst), (caddr_t)&ipaddr.sin_addr,
1088 			    sizeof(ipaddr.sin_addr));
1089 			/*
1090 			 * locate outgoing interface; if we're the destination,
1091 			 * use the incoming interface (should be same).
1092 			 */
1093 			if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 &&
1094 			    (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) {
1095 				type = ICMP_UNREACH;
1096 				code = ICMP_UNREACH_HOST;
1097 				goto bad;
1098 			}
1099 			bcopy((caddr_t)&ia->ia_addr.sin_addr,
1100 			    (caddr_t)(cp + off), sizeof(struct in_addr));
1101 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1102 			break;
1103 
1104 		case IPOPT_TS:
1105 			code = cp - (u_char *)ip;
1106 			bcopy(cp, &ipt, sizeof(struct ip_timestamp));
1107 			if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5)
1108 				goto bad;
1109 			if (ipt.ipt_ptr - 1 + sizeof(n_time) > ipt.ipt_len) {
1110 				if (++ipt.ipt_oflw == 0)
1111 					goto bad;
1112 				break;
1113 			}
1114 			bcopy(cp + ipt.ipt_ptr - 1, &sin, sizeof sin);
1115 			switch (ipt.ipt_flg) {
1116 
1117 			case IPOPT_TS_TSONLY:
1118 				break;
1119 
1120 			case IPOPT_TS_TSANDADDR:
1121 				if (ipt.ipt_ptr - 1 + sizeof(n_time) +
1122 				    sizeof(struct in_addr) > ipt.ipt_len)
1123 					goto bad;
1124 				ipaddr.sin_addr = dst;
1125 				ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
1126 							    m->m_pkthdr.rcvif);
1127 				if (ia == 0)
1128 					continue;
1129 				bcopy((caddr_t)&ia->ia_addr.sin_addr,
1130 				    (caddr_t)&sin, sizeof(struct in_addr));
1131 				ipt.ipt_ptr += sizeof(struct in_addr);
1132 				break;
1133 
1134 			case IPOPT_TS_PRESPEC:
1135 				if (ipt.ipt_ptr - 1 + sizeof(n_time) +
1136 				    sizeof(struct in_addr) > ipt.ipt_len)
1137 					goto bad;
1138 				bcopy((caddr_t)&sin, (caddr_t)&ipaddr.sin_addr,
1139 				    sizeof(struct in_addr));
1140 				if (ifa_ifwithaddr((SA)&ipaddr) == 0)
1141 					continue;
1142 				ipt.ipt_ptr += sizeof(struct in_addr);
1143 				break;
1144 
1145 			default:
1146 				/* XXX can't take &ipt->ipt_flg */
1147 				code = (u_char *)&ipt.ipt_ptr -
1148 				    (u_char *)ip + 1;
1149 				goto bad;
1150 			}
1151 			ntime = iptime();
1152 			bcopy((caddr_t)&ntime, (caddr_t)cp + ipt.ipt_ptr - 1,
1153 			    sizeof(n_time));
1154 			ipt.ipt_ptr += sizeof(n_time);
1155 		}
1156 	}
1157 	if (forward && ipforwarding) {
1158 		ip_forward(m, 1);
1159 		return (1);
1160 	}
1161 	return (0);
1162 bad:
1163 	ip->ip_len -= ip->ip_hl << 2;   /* XXX icmp_error adds in hdr length */
1164 	icmp_error(m, type, code, 0, 0);
1165 	ipstat.ips_badoptions++;
1166 	return (1);
1167 }
1168 
1169 /*
1170  * Given address of next destination (final or next hop),
1171  * return internet address info of interface to be used to get there.
1172  */
1173 struct in_ifaddr *
1174 ip_rtaddr(dst)
1175 	 struct in_addr dst;
1176 {
1177 	register struct sockaddr_in *sin;
1178 
1179 	sin = satosin(&ipforward_rt.ro_dst);
1180 
1181 	if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) {
1182 		if (ipforward_rt.ro_rt) {
1183 			RTFREE(ipforward_rt.ro_rt);
1184 			ipforward_rt.ro_rt = 0;
1185 		}
1186 		sin->sin_family = AF_INET;
1187 		sin->sin_len = sizeof(*sin);
1188 		sin->sin_addr = dst;
1189 
1190 		rtalloc(&ipforward_rt);
1191 	}
1192 	if (ipforward_rt.ro_rt == 0)
1193 		return ((struct in_ifaddr *)0);
1194 	return (ifatoia(ipforward_rt.ro_rt->rt_ifa));
1195 }
1196 
1197 /*
1198  * Save incoming source route for use in replies,
1199  * to be picked up later by ip_srcroute if the receiver is interested.
1200  */
1201 void
1202 save_rte(option, dst)
1203 	u_char *option;
1204 	struct in_addr dst;
1205 {
1206 	unsigned olen;
1207 
1208 	olen = option[IPOPT_OLEN];
1209 #ifdef DIAGNOSTIC
1210 	if (ipprintfs)
1211 		printf("save_rte: olen %d\n", olen);
1212 #endif /* 0 */
1213 	if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst)))
1214 		return;
1215 	bcopy((caddr_t)option, (caddr_t)ip_srcrt.srcopt, olen);
1216 	ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1217 	ip_srcrt.dst = dst;
1218 }
1219 
1220 /*
1221  * Check whether we do proxy ARP for this address and we point to ourselves.
1222  * Code shamelessly copied from arplookup().
1223  */
1224 static int
1225 ip_weadvertise(addr)
1226 	u_int32_t addr;
1227 {
1228 	register struct rtentry *rt;
1229 	register struct ifnet *ifp;
1230 	register struct ifaddr *ifa;
1231 	struct sockaddr_inarp sin;
1232 
1233 	sin.sin_len = sizeof(sin);
1234 	sin.sin_family = AF_INET;
1235 	sin.sin_addr.s_addr = addr;
1236 	sin.sin_other = SIN_PROXY;
1237 	rt = rtalloc1(sintosa(&sin), 0);
1238 	if (rt == 0)
1239 		return 0;
1240 
1241 	RTFREE(rt);
1242 
1243 	if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 ||
1244 	    rt->rt_gateway->sa_family != AF_LINK) {
1245 		RTFREE(rt);
1246 		return 0;
1247 	}
1248 
1249 	for (ifp = ifnet.tqh_first; ifp != 0; ifp = ifp->if_list.tqe_next)
1250 		for (ifa = ifp->if_addrlist.tqh_first; ifa != 0;
1251 		    ifa = ifa->ifa_list.tqe_next) {
1252 			if (ifa->ifa_addr->sa_family != rt->rt_gateway->sa_family)
1253 				continue;
1254 
1255 			if (!bcmp(LLADDR((struct sockaddr_dl *)ifa->ifa_addr),
1256 			    LLADDR((struct sockaddr_dl *)rt->rt_gateway),
1257 			    ETHER_ADDR_LEN)) {
1258 				RTFREE(rt);
1259 				return 1;
1260 			}
1261 		}
1262 
1263 	RTFREE(rt);
1264 	return 0;
1265 }
1266 
1267 /*
1268  * Retrieve incoming source route for use in replies,
1269  * in the same form used by setsockopt.
1270  * The first hop is placed before the options, will be removed later.
1271  */
1272 struct mbuf *
1273 ip_srcroute()
1274 {
1275 	register struct in_addr *p, *q;
1276 	register struct mbuf *m;
1277 
1278 	if (ip_nhops == 0)
1279 		return ((struct mbuf *)0);
1280 	m = m_get(M_DONTWAIT, MT_SOOPTS);
1281 	if (m == 0)
1282 		return ((struct mbuf *)0);
1283 
1284 #define OPTSIZ	(sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
1285 
1286 	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
1287 	m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
1288 	    OPTSIZ;
1289 #ifdef DIAGNOSTIC
1290 	if (ipprintfs)
1291 		printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
1292 #endif
1293 
1294 	/*
1295 	 * First save first hop for return route
1296 	 */
1297 	p = &ip_srcrt.route[ip_nhops - 1];
1298 	*(mtod(m, struct in_addr *)) = *p--;
1299 #ifdef DIAGNOSTIC
1300 	if (ipprintfs)
1301 		printf(" hops %x", ntohl(mtod(m, struct in_addr *)->s_addr));
1302 #endif
1303 
1304 	/*
1305 	 * Copy option fields and padding (nop) to mbuf.
1306 	 */
1307 	ip_srcrt.nop = IPOPT_NOP;
1308 	ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
1309 	bcopy((caddr_t)&ip_srcrt.nop,
1310 	    mtod(m, caddr_t) + sizeof(struct in_addr), OPTSIZ);
1311 	q = (struct in_addr *)(mtod(m, caddr_t) +
1312 	    sizeof(struct in_addr) + OPTSIZ);
1313 #undef OPTSIZ
1314 	/*
1315 	 * Record return path as an IP source route,
1316 	 * reversing the path (pointers are now aligned).
1317 	 */
1318 	while (p >= ip_srcrt.route) {
1319 #ifdef DIAGNOSTIC
1320 		if (ipprintfs)
1321 			printf(" %x", ntohl(q->s_addr));
1322 #endif
1323 		*q++ = *p--;
1324 	}
1325 	/*
1326 	 * Last hop goes to final destination.
1327 	 */
1328 	*q = ip_srcrt.dst;
1329 #ifdef DIAGNOSTIC
1330 	if (ipprintfs)
1331 		printf(" %x\n", ntohl(q->s_addr));
1332 #endif
1333 	return (m);
1334 }
1335 
1336 /*
1337  * Strip out IP options, at higher
1338  * level protocol in the kernel.
1339  * Second argument is buffer to which options
1340  * will be moved, and return value is their length.
1341  * XXX should be deleted; last arg currently ignored.
1342  */
1343 void
1344 ip_stripoptions(m, mopt)
1345 	register struct mbuf *m;
1346 	struct mbuf *mopt;
1347 {
1348 	register int i;
1349 	struct ip *ip = mtod(m, struct ip *);
1350 	register caddr_t opts;
1351 	int olen;
1352 
1353 	olen = (ip->ip_hl<<2) - sizeof (struct ip);
1354 	opts = (caddr_t)(ip + 1);
1355 	i = m->m_len - (sizeof (struct ip) + olen);
1356 	bcopy(opts  + olen, opts, (unsigned)i);
1357 	m->m_len -= olen;
1358 	if (m->m_flags & M_PKTHDR)
1359 		m->m_pkthdr.len -= olen;
1360 	ip->ip_hl = sizeof(struct ip) >> 2;
1361 }
1362 
/*
 * Table of PRC_NCMDS errno values, one per slot; a zero entry means
 * no errno is associated with that slot.
 * NOTE(review): presumably indexed by PRC_* protocol control command;
 * confirm against the users of this table.
 */
int inetctlerrmap[PRC_NCMDS] = {
	0,			/*  0 */
	0,			/*  1 */
	0,			/*  2 */
	0,			/*  3 */
	0,			/*  4 */
	EMSGSIZE,		/*  5 */
	EHOSTDOWN,		/*  6 */
	EHOSTUNREACH,		/*  7 */
	EHOSTUNREACH,		/*  8 */
	EHOSTUNREACH,		/*  9 */
	ECONNREFUSED,		/* 10 */
	ECONNREFUSED,		/* 11 */
	EMSGSIZE,		/* 12 */
	EHOSTUNREACH,		/* 13 */
	0,			/* 14 */
	0,			/* 15 */
	0,			/* 16 */
	0,			/* 17 */
	0,			/* 18 */
	0,			/* 19 */
	ENOPROTOOPT		/* 20 */
};
1371 
1372 /*
1373  * Forward a packet.  If some error occurs return the sender
1374  * an icmp packet.  Note we can't always generate a meaningful
1375  * icmp message because icmp doesn't have a large enough repertoire
1376  * of codes and types.
1377  *
1378  * If not forwarding, just drop the packet.  This could be confusing
1379  * if ipforwarding was zero but some routing protocol was advancing
1380  * us as a gateway to somewhere.  However, we must let the routing
1381  * protocol deal with that.
1382  *
1383  * The srcrt parameter indicates whether the packet is being forwarded
1384  * via a source route.
1385  */
1386 void
1387 ip_forward(m, srcrt)
1388 	struct mbuf *m;
1389 	int srcrt;
1390 {
1391 	register struct ip *ip = mtod(m, struct ip *);
1392 	register struct sockaddr_in *sin;
1393 	register struct rtentry *rt;
1394 	int error, type = 0, code = 0;
1395 	struct mbuf *mcopy;
1396 	n_long dest;
1397 	struct ifnet *destifp;
1398 #ifdef IPSEC
1399 	struct ifnet dummyifp;
1400 #endif
1401 
1402 	dest = 0;
1403 #ifdef DIAGNOSTIC
1404 	if (ipprintfs)
1405 		printf("forward: src %x dst %x ttl %x\n", ip->ip_src.s_addr,
1406 		    ip->ip_dst.s_addr, ip->ip_ttl);
1407 #endif
1408 	if (m->m_flags & M_BCAST || in_canforward(ip->ip_dst) == 0) {
1409 		ipstat.ips_cantforward++;
1410 		m_freem(m);
1411 		return;
1412 	}
1413 	if (ip->ip_ttl <= IPTTLDEC) {
1414 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
1415 		return;
1416 	}
1417 	ip->ip_ttl -= IPTTLDEC;
1418 
1419 	sin = satosin(&ipforward_rt.ro_dst);
1420 	if ((rt = ipforward_rt.ro_rt) == 0 ||
1421 	    ip->ip_dst.s_addr != sin->sin_addr.s_addr) {
1422 		if (ipforward_rt.ro_rt) {
1423 			RTFREE(ipforward_rt.ro_rt);
1424 			ipforward_rt.ro_rt = 0;
1425 		}
1426 		sin->sin_family = AF_INET;
1427 		sin->sin_len = sizeof(*sin);
1428 		sin->sin_addr = ip->ip_dst;
1429 
1430 		rtalloc(&ipforward_rt);
1431 		if (ipforward_rt.ro_rt == 0) {
1432 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
1433 			return;
1434 		}
1435 		rt = ipforward_rt.ro_rt;
1436 	}
1437 
1438 	/*
1439 	 * Save at most 68 bytes of the packet in case
1440 	 * we need to generate an ICMP message to the src.
1441 	 * Pullup to avoid sharing mbuf cluster between m and mcopy.
1442 	 */
1443 	mcopy = m_copym(m, 0, imin((int)ip->ip_len, 68), M_DONTWAIT);
1444 	if (mcopy)
1445 		mcopy = m_pullup(mcopy, ip->ip_hl << 2);
1446 
1447 	/*
1448 	 * If forwarding packet using same interface that it came in on,
1449 	 * perhaps should send a redirect to sender to shortcut a hop.
1450 	 * Only send redirect if source is sending directly to us,
1451 	 * and if packet was not source routed (or has any options).
1452 	 * Also, don't send redirect if forwarding using a default route
1453 	 * or a route modified by a redirect.
1454 	 * Don't send redirect if we advertise destination's arp address
1455 	 * as ours (proxy arp).
1456 	 */
1457 	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
1458 	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
1459 	    satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
1460 	    ipsendredirects && !srcrt &&
1461 	    !ip_weadvertise(satosin(rt_key(rt))->sin_addr.s_addr)) {
1462 		if (rt->rt_ifa &&
1463 		    (ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_subnetmask) ==
1464 		    ifatoia(rt->rt_ifa)->ia_subnet) {
1465 		    if (rt->rt_flags & RTF_GATEWAY)
1466 			dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
1467 		    else
1468 			dest = ip->ip_dst.s_addr;
1469 		    /* Router requirements says to only send host redirects */
1470 		    type = ICMP_REDIRECT;
1471 		    code = ICMP_REDIRECT_HOST;
1472 #ifdef DIAGNOSTIC
1473 		    if (ipprintfs)
1474 			printf("redirect (%d) to %x\n", code, (u_int32_t)dest);
1475 #endif
1476 		}
1477 	}
1478 
1479 #if 0 /*KAME IPSEC*/
1480 	m->m_pkthdr.rcvif = NULL;
1481 #endif /*IPSEC*/
1482 	error = ip_output(m, (struct mbuf *)0, &ipforward_rt,
1483 	    (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
1484 	    0, NULL, NULL);
1485 	if (error)
1486 		ipstat.ips_cantforward++;
1487 	else {
1488 		ipstat.ips_forward++;
1489 		if (type)
1490 			ipstat.ips_redirectsent++;
1491 		else {
1492 			if (mcopy)
1493 				m_freem(mcopy);
1494 			return;
1495 		}
1496 	}
1497 	if (mcopy == NULL)
1498 		return;
1499 	destifp = NULL;
1500 
1501 	switch (error) {
1502 
1503 	case 0:				/* forwarded, but need redirect */
1504 		/* type, code set above */
1505 		break;
1506 
1507 	case ENETUNREACH:		/* shouldn't happen, checked above */
1508 	case EHOSTUNREACH:
1509 	case ENETDOWN:
1510 	case EHOSTDOWN:
1511 	default:
1512 		type = ICMP_UNREACH;
1513 		code = ICMP_UNREACH_HOST;
1514 		break;
1515 
1516 	case EMSGSIZE:
1517 		type = ICMP_UNREACH;
1518 		code = ICMP_UNREACH_NEEDFRAG;
1519 
1520 #ifdef IPSEC
1521 		if (ipforward_rt.ro_rt) {
1522 			struct rtentry *rt = ipforward_rt.ro_rt;
1523 			destifp = ipforward_rt.ro_rt->rt_ifp;
1524 			/*
1525 			 * XXX BUG ALERT
1526 			 * The "dummyifp" code relies upon the fact
1527 			 * that icmp_error() touches only ifp->if_mtu.
1528 			 */
1529 			if (rt->rt_rmx.rmx_mtu) {
1530 				dummyifp.if_mtu = rt->rt_rmx.rmx_mtu;
1531 				destifp = &dummyifp;
1532 			}
1533 		}
1534 #endif /*IPSEC*/
1535 		ipstat.ips_cantfrag++;
1536 		break;
1537 
1538 	case ENOBUFS:
1539 		type = ICMP_SOURCEQUENCH;
1540 		code = 0;
1541 		break;
1542 	}
1543 
1544 	icmp_error(mcopy, type, code, dest, destifp);
1545 }
1546 
1547 int
1548 ip_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
1549 	int *name;
1550 	u_int namelen;
1551 	void *oldp;
1552 	size_t *oldlenp;
1553 	void *newp;
1554 	size_t newlen;
1555 {
1556 	int error;
1557 
1558 	/* All sysctl names at this level are terminal. */
1559 	if (namelen != 1)
1560 		return (ENOTDIR);
1561 
1562 	switch (name[0]) {
1563 	case IPCTL_FORWARDING:
1564 		return (sysctl_int(oldp, oldlenp, newp, newlen, &ipforwarding));
1565 	case IPCTL_SENDREDIRECTS:
1566 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1567 			&ipsendredirects));
1568 	case IPCTL_DEFTTL:
1569 		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_defttl));
1570 #ifdef notyet
1571 	case IPCTL_DEFMTU:
1572 		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtu));
1573 #endif
1574 	case IPCTL_SOURCEROUTE:
1575 		/*
1576 		 * Don't allow this to change in a secure environment.
1577 		 */
1578 		if (newp && securelevel > 0)
1579 			return (EPERM);
1580 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1581 		    &ip_dosourceroute));
1582 	case IPCTL_DIRECTEDBCAST:
1583 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1584 		    &ip_directedbcast));
1585 	case IPCTL_MTUDISC:
1586 		error = sysctl_int(oldp, oldlenp, newp, newlen,
1587 		    &ip_mtudisc);
1588 		if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) {
1589 			ip_mtudisc_timeout_q =
1590 			    rt_timer_queue_create(ip_mtudisc_timeout);
1591 		} else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) {
1592 			rt_timer_queue_destroy(ip_mtudisc_timeout_q, TRUE);
1593 			Free(ip_mtudisc_timeout_q);
1594 			ip_mtudisc_timeout_q = NULL;
1595 		}
1596 		return error;
1597 	case IPCTL_MTUDISCTIMEOUT:
1598 		error = sysctl_int(oldp, oldlenp, newp, newlen,
1599 		   &ip_mtudisc_timeout);
1600 		if (ip_mtudisc_timeout_q != NULL)
1601 			rt_timer_queue_change(ip_mtudisc_timeout_q,
1602 					      ip_mtudisc_timeout);
1603 		return (error);
1604 	case IPCTL_IPPORT_FIRSTAUTO:
1605 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1606 		    &ipport_firstauto));
1607 	case IPCTL_IPPORT_LASTAUTO:
1608 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1609 		    &ipport_lastauto));
1610 	case IPCTL_IPPORT_HIFIRSTAUTO:
1611 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1612 		    &ipport_hifirstauto));
1613 	case IPCTL_IPPORT_HILASTAUTO:
1614 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1615 		    &ipport_hilastauto));
1616 	case IPCTL_IPPORT_MAXQUEUE:
1617 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1618 		    &ip_maxqueue));
1619 	case IPCTL_ENCDEBUG:
1620 		return (sysctl_int(oldp, oldlenp, newp, newlen, &encdebug));
1621 	case IPCTL_IPSEC_EMBRYONIC_SA_TIMEOUT:
1622 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1623 				   &ipsec_keep_invalid));
1624 	case IPCTL_IPSEC_REQUIRE_PFS:
1625 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1626 				   &ipsec_require_pfs));
1627 	case IPCTL_IPSEC_SOFT_ALLOCATIONS:
1628 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1629 				   &ipsec_soft_allocations));
1630 	case IPCTL_IPSEC_ALLOCATIONS:
1631 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1632 				   &ipsec_exp_allocations));
1633 	case IPCTL_IPSEC_SOFT_BYTES:
1634 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1635 				   &ipsec_soft_bytes));
1636 	case IPCTL_IPSEC_BYTES:
1637 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1638 				   &ipsec_exp_bytes));
1639 	case IPCTL_IPSEC_TIMEOUT:
1640 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1641 				   &ipsec_exp_timeout));
1642 	case IPCTL_IPSEC_SOFT_TIMEOUT:
1643 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1644 				   &ipsec_soft_timeout));
1645 	case IPCTL_IPSEC_SOFT_FIRSTUSE:
1646 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1647 				   &ipsec_soft_first_use));
1648 	case IPCTL_IPSEC_FIRSTUSE:
1649 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1650 				   &ipsec_exp_first_use));
1651 	case IPCTL_IPSEC_ENC_ALGORITHM:
1652 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1653 				       ipsec_def_enc, sizeof(ipsec_def_enc)));
1654 	case IPCTL_IPSEC_AUTH_ALGORITHM:
1655 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1656 				       ipsec_def_auth,
1657 				       sizeof(ipsec_def_auth)));
1658 	case IPCTL_IPSEC_EXPIRE_ACQUIRE:
1659 	        return (sysctl_int(oldp, oldlenp, newp, newlen,
1660 				   &ipsec_expire_acquire));
1661 	case IPCTL_IPSEC_IPCOMP_ALGORITHM:
1662 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1663 				       ipsec_def_comp,
1664 				       sizeof(ipsec_def_comp)));
1665 	default:
1666 		return (EOPNOTSUPP);
1667 	}
1668 	/* NOTREACHED */
1669 }
1670