xref: /netbsd/sys/netinet/ip_mroute.c (revision bf9ec67e)
1 /*	$NetBSD: ip_mroute.c,v 1.59 2002/03/04 13:24:12 sommerfeld Exp $	*/
2 
3 /*
4  * IP multicast forwarding procedures
5  *
6  * Written by David Waitzman, BBN Labs, August 1988.
7  * Modified by Steve Deering, Stanford, February 1989.
8  * Modified by Mark J. Steiglitz, Stanford, May, 1991
9  * Modified by Van Jacobson, LBL, January 1993
10  * Modified by Ajit Thyagarajan, PARC, August 1993
11  * Modified by Bill Fenner, PARC, April 1994
12  * Modified by Charles M. Hannum, NetBSD, May 1995.
13  *
14  * MROUTING Revision: 1.2
15  */
16 
17 #include <sys/cdefs.h>
18 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.59 2002/03/04 13:24:12 sommerfeld Exp $");
19 
20 #include "opt_ipsec.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/callout.h>
25 #include <sys/mbuf.h>
26 #include <sys/socket.h>
27 #include <sys/socketvar.h>
28 #include <sys/protosw.h>
29 #include <sys/errno.h>
30 #include <sys/time.h>
31 #include <sys/kernel.h>
32 #include <sys/ioctl.h>
33 #include <sys/syslog.h>
34 #include <net/if.h>
35 #include <net/route.h>
36 #include <net/raw_cb.h>
37 #include <netinet/in.h>
38 #include <netinet/in_var.h>
39 #include <netinet/in_systm.h>
40 #include <netinet/ip.h>
41 #include <netinet/ip_var.h>
42 #include <netinet/in_pcb.h>
43 #include <netinet/udp.h>
44 #include <netinet/igmp.h>
45 #include <netinet/igmp_var.h>
46 #include <netinet/ip_mroute.h>
47 #include <netinet/ip_encap.h>
48 
49 #include <machine/stdarg.h>
50 
51 #define IP_MULTICASTOPTS 0
/*
 * Run m_pullup() on mbuf (m) for (len) bytes when (m) is non-null and
 * either has M_EXT set or holds fewer than (len) bytes in its first mbuf.
 * (m) is overwritten with m_pullup()'s result; a null (m) is left alone.
 */
#define	M_PULLUP(m, len) \
	do { \
		if ((m) != 0 && \
		    ((((m)->m_flags & M_EXT) != 0) || (m)->m_len < (len))) \
			(m) = m_pullup((m), (len)); \
	} while (0)
57 
58 /*
59  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
60  * except for netstat or debugging purposes.
61  */
62 struct socket  *ip_mrouter  = 0;
63 int		ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
64 
65 #define NO_RTE_FOUND 	0x1
66 #define RTE_FOUND	0x2
67 
68 #define	MFCHASH(a, g) \
69 	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \
70 	  ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
71 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
72 u_long	mfchash;
73 
74 u_char		nexpire[MFCTBLSIZ];
75 struct vif	viftable[MAXVIFS];
76 struct mrtstat	mrtstat;
77 u_int		mrtdebug = 0;	  /* debug level 	*/
78 #define		DEBUG_MFC	0x02
79 #define		DEBUG_FORWARD	0x04
80 #define		DEBUG_EXPIRE	0x08
81 #define		DEBUG_XMIT	0x10
82 u_int       	tbfdebug = 0;     /* tbf debug level 	*/
83 #ifdef RSVP_ISI
84 u_int		rsvpdebug = 0;	  /* rsvp debug level   */
85 extern struct socket *ip_rsvpd;
86 extern int rsvp_on;
87 #endif /* RSVP_ISI */
88 
89 /* vif attachment using sys/netinet/ip_encap.c */
90 extern struct domain inetdomain;
91 static void vif_input __P((struct mbuf *, ...));
92 static int vif_encapcheck __P((const struct mbuf *, int, int, void *));
93 static struct protosw vif_protosw =
94 { SOCK_RAW,	&inetdomain,	IPPROTO_IPV4,	PR_ATOMIC|PR_ADDR,
95   vif_input,	rip_output,	0,		rip_ctloutput,
96   rip_usrreq,
97   0,            0,              0,              0,
98 };
99 
100 #define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second */
101 #define		UPCALL_EXPIRE	6		/* number of timeouts */
102 
103 /*
104  * Define the token bucket filter structures
105  */
106 
107 #define		TBF_REPROCESS	(hz / 100)	/* 100x / second */
108 
109 static int get_sg_cnt __P((struct sioc_sg_req *));
110 static int get_vif_cnt __P((struct sioc_vif_req *));
111 static int ip_mrouter_init __P((struct socket *, struct mbuf *));
112 static int get_version __P((struct mbuf *));
113 static int set_assert __P((struct mbuf *));
114 static int get_assert __P((struct mbuf *));
115 static int add_vif __P((struct mbuf *));
116 static int del_vif __P((struct mbuf *));
117 static void update_mfc __P((struct mfcctl *, struct mfc *));
118 static void expire_mfc __P((struct mfc *));
119 static int add_mfc __P((struct mbuf *));
120 #ifdef UPCALL_TIMING
121 static void collate __P((struct timeval *));
122 #endif
123 static int del_mfc __P((struct mbuf *));
124 static int socket_send __P((struct socket *, struct mbuf *,
125 			    struct sockaddr_in *));
126 static void expire_upcalls __P((void *));
127 #ifdef RSVP_ISI
128 static int ip_mdq __P((struct mbuf *, struct ifnet *, struct mfc *, vifi_t));
129 #else
130 static int ip_mdq __P((struct mbuf *, struct ifnet *, struct mfc *));
131 #endif
132 static void phyint_send __P((struct ip *, struct vif *, struct mbuf *));
133 static void encap_send __P((struct ip *, struct vif *, struct mbuf *));
134 static void tbf_control __P((struct vif *, struct mbuf *, struct ip *,
135 			     u_int32_t));
136 static void tbf_queue __P((struct vif *, struct mbuf *));
137 static void tbf_process_q __P((struct vif *));
138 static void tbf_reprocess_q __P((void *));
139 static int tbf_dq_sel __P((struct vif *, struct ip *));
140 static void tbf_send_packet __P((struct vif *, struct mbuf *));
141 static void tbf_update_tokens __P((struct vif *));
142 static int priority __P((struct vif *, struct ip *));
143 
144 /*
145  * 'Interfaces' associated with decapsulator (so we can tell
146  * packets that went through it from ones that get reflected
147  * by a broken gateway).  These interfaces are never linked into
148  * the system ifnet list & no routes point to them.  I.e., packets
149  * can't be sent this way.  They only exist as a placeholder for
150  * multicast source verification.
151  */
152 #if 0
153 struct ifnet multicast_decap_if[MAXVIFS];
154 #endif
155 
156 #define	ENCAP_TTL	64
157 #define	ENCAP_PROTO	IPPROTO_IPIP	/* 4 */
158 
159 /* prototype IP hdr for encapsulated packets */
160 struct ip multicast_encap_iphdr = {
161 #if BYTE_ORDER == LITTLE_ENDIAN
162 	sizeof(struct ip) >> 2, IPVERSION,
163 #else
164 	IPVERSION, sizeof(struct ip) >> 2,
165 #endif
166 	0,				/* tos */
167 	sizeof(struct ip),		/* total length */
168 	0,				/* id */
169 	0,				/* frag offset */
170 	ENCAP_TTL, ENCAP_PROTO,
171 	0,				/* checksum */
172 };
173 
174 /*
175  * Private variables.
176  */
177 static vifi_t	   numvifs = 0;
178 
179 static struct callout expire_upcalls_ch;
180 
181 /*
182  * one-back cache used by vif_encapcheck to locate a tunnel's vif
183  * given a datagram's src ip address.
184  */
185 static struct in_addr last_encap_src;
186 static struct vif *last_encap_vif;
187 
188 /*
189  * whether or not special PIM assert processing is enabled.
190  */
191 static int pim_assert;
192 /*
193  * Rate limit for assert notification messages, in usec
194  */
195 #define ASSERT_MSG_TIME		3000000
196 
/*
 * Look up the forwarding cache entry for origin (o) and multicast group (g).
 * Only entries whose stall queue is empty (mfc_stall == 0) can match.
 * On completion (rt) is the matching entry, or 0 if there is none; the
 * lookup and miss counters in mrtstat are updated accordingly.
 * Type of service parameter to be added in the future!!!
 */

#define MFCFIND(o, g, rt) { \
	struct mfc *_ent; \
	(rt) = 0; \
	++mrtstat.mrts_mfc_lookups; \
	for (_ent = LIST_FIRST(&mfchashtbl[MFCHASH(o, g)]); _ent != 0; \
	    _ent = LIST_NEXT(_ent, mfc_hash)) { \
		if (_ent->mfc_stall != 0) \
			continue; \
		if (!in_hosteq(_ent->mfc_origin, (o)) || \
		    !in_hosteq(_ent->mfc_mcastgrp, (g))) \
			continue; \
		(rt) = _ent; \
		break; \
	} \
	if ((rt) == 0) \
		++mrtstat.mrts_mfc_misses; \
}
217 
/*
 * Compute delta = a - b in microseconds for two struct timeval values.
 * The common cases of a 0, 1 or 2 second difference avoid a multiply.
 * Borrowed from Van Jacobson's scheduling code
 */
#define TV_DELTA(a, b, delta) { \
	int tvd_sec; \
	(delta) = (a).tv_usec - (b).tv_usec; \
	tvd_sec = (a).tv_sec - (b).tv_sec; \
	if (tvd_sec == 1) \
		(delta) += 1000000; \
	else if (tvd_sec == 2) \
		(delta) += 2 * 1000000; \
	else if (tvd_sec != 0) \
		(delta) += (1000000 * tvd_sec); \
}
240 
241 #ifdef UPCALL_TIMING
242 u_int32_t upcall_data[51];
243 #endif /* UPCALL_TIMING */
244 
245 /*
246  * Handle MRT setsockopt commands to modify the multicast routing tables.
247  */
248 int
249 ip_mrouter_set(so, optname, m)
250 	struct socket *so;
251 	int optname;
252 	struct mbuf **m;
253 {
254 	int error;
255 
256 	if (optname != MRT_INIT && so != ip_mrouter)
257 		error = ENOPROTOOPT;
258 	else
259 		switch (optname) {
260 		case MRT_INIT:
261 			error = ip_mrouter_init(so, *m);
262 			break;
263 		case MRT_DONE:
264 			error = ip_mrouter_done();
265 			break;
266 		case MRT_ADD_VIF:
267 			error = add_vif(*m);
268 			break;
269 		case MRT_DEL_VIF:
270 			error = del_vif(*m);
271 			break;
272 		case MRT_ADD_MFC:
273 			error = add_mfc(*m);
274 			break;
275 		case MRT_DEL_MFC:
276 			error = del_mfc(*m);
277 			break;
278 		case MRT_ASSERT:
279 			error = set_assert(*m);
280 			break;
281 		default:
282 			error = ENOPROTOOPT;
283 			break;
284 		}
285 
286 	if (*m)
287 		m_free(*m);
288 	return (error);
289 }
290 
291 /*
292  * Handle MRT getsockopt commands
293  */
294 int
295 ip_mrouter_get(so, optname, m)
296 	struct socket *so;
297 	int optname;
298 	struct mbuf **m;
299 {
300 	int error;
301 
302 	if (so != ip_mrouter)
303 		error = ENOPROTOOPT;
304 	else {
305 		*m = m_get(M_WAIT, MT_SOOPTS);
306 
307 		switch (optname) {
308 		case MRT_VERSION:
309 			error = get_version(*m);
310 			break;
311 		case MRT_ASSERT:
312 			error = get_assert(*m);
313 			break;
314 		default:
315 			error = ENOPROTOOPT;
316 			break;
317 		}
318 
319 		if (error)
320 			m_free(*m);
321 	}
322 
323 	return (error);
324 }
325 
326 /*
327  * Handle ioctl commands to obtain information from the cache
328  */
329 int
330 mrt_ioctl(so, cmd, data)
331 	struct socket *so;
332 	u_long cmd;
333 	caddr_t data;
334 {
335 	int error;
336 
337 	if (so != ip_mrouter)
338 		error = EINVAL;
339 	else
340 		switch (cmd) {
341 		case SIOCGETVIFCNT:
342 			error = get_vif_cnt((struct sioc_vif_req *)data);
343 			break;
344 		case SIOCGETSGCNT:
345 			error = get_sg_cnt((struct sioc_sg_req *)data);
346 			break;
347 		default:
348 			error = EINVAL;
349 			break;
350 		}
351 
352 	return (error);
353 }
354 
355 /*
356  * returns the packet, byte, rpf-failure count for the source group provided
357  */
358 static int
359 get_sg_cnt(req)
360 	struct sioc_sg_req *req;
361 {
362 	struct mfc *rt;
363 	int s;
364 
365 	s = splsoftnet();
366 	MFCFIND(req->src, req->grp, rt);
367 	splx(s);
368 	if (rt != 0) {
369 		req->pktcnt = rt->mfc_pkt_cnt;
370 		req->bytecnt = rt->mfc_byte_cnt;
371 		req->wrong_if = rt->mfc_wrong_if;
372 	} else
373 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
374 
375 	return (0);
376 }
377 
378 /*
379  * returns the input and output packet and byte counts on the vif provided
380  */
381 static int
382 get_vif_cnt(req)
383 	struct sioc_vif_req *req;
384 {
385 	vifi_t vifi = req->vifi;
386 
387 	if (vifi >= numvifs)
388 		return (EINVAL);
389 
390 	req->icount = viftable[vifi].v_pkt_in;
391 	req->ocount = viftable[vifi].v_pkt_out;
392 	req->ibytes = viftable[vifi].v_bytes_in;
393 	req->obytes = viftable[vifi].v_bytes_out;
394 
395 	return (0);
396 }
397 
398 /*
399  * Enable multicast routing
400  */
401 static int
402 ip_mrouter_init(so, m)
403 	struct socket *so;
404 	struct mbuf *m;
405 {
406 	int *v;
407 
408 	if (mrtdebug)
409 		log(LOG_DEBUG,
410 		    "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
411 		    so->so_type, so->so_proto->pr_protocol);
412 
413 	if (so->so_type != SOCK_RAW ||
414 	    so->so_proto->pr_protocol != IPPROTO_IGMP)
415 		return (EOPNOTSUPP);
416 
417 	if (m == 0 || m->m_len < sizeof(int))
418 		return (EINVAL);
419 
420 	v = mtod(m, int *);
421 	if (*v != 1)
422 		return (EINVAL);
423 
424 	if (ip_mrouter != 0)
425 		return (EADDRINUSE);
426 
427 	ip_mrouter = so;
428 
429 	mfchashtbl =
430 	    hashinit(MFCTBLSIZ, HASH_LIST, M_MRTABLE, M_WAITOK, &mfchash);
431 	bzero((caddr_t)nexpire, sizeof(nexpire));
432 
433 	pim_assert = 0;
434 
435 	callout_init(&expire_upcalls_ch);
436 	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
437 	    expire_upcalls, NULL);
438 
439 	if (mrtdebug)
440 		log(LOG_DEBUG, "ip_mrouter_init\n");
441 
442 	return (0);
443 }
444 
445 /*
446  * Disable multicast routing
447  */
448 int
449 ip_mrouter_done()
450 {
451 	vifi_t vifi;
452 	struct vif *vifp;
453 	int i;
454 	int s;
455 
456 	s = splsoftnet();
457 
458 	/* Clear out all the vifs currently in use. */
459 	for (vifi = 0; vifi < numvifs; vifi++) {
460 		vifp = &viftable[vifi];
461 		if (!in_nullhost(vifp->v_lcl_addr))
462 			reset_vif(vifp);
463 	}
464 
465 	numvifs = 0;
466 	pim_assert = 0;
467 
468 	callout_stop(&expire_upcalls_ch);
469 
470 	/*
471 	 * Free all multicast forwarding cache entries.
472 	 */
473 	for (i = 0; i < MFCTBLSIZ; i++) {
474 		struct mfc *rt, *nrt;
475 
476 		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
477 			nrt = LIST_NEXT(rt, mfc_hash);
478 
479 			expire_mfc(rt);
480 		}
481 	}
482 
483 	free(mfchashtbl, M_MRTABLE);
484 	mfchashtbl = 0;
485 
486 	/* Reset de-encapsulation cache. */
487 
488 	ip_mrouter = 0;
489 
490 	splx(s);
491 
492 	if (mrtdebug)
493 		log(LOG_DEBUG, "ip_mrouter_done\n");
494 
495 	return (0);
496 }
497 
498 static int
499 get_version(m)
500 	struct mbuf *m;
501 {
502 	int *v = mtod(m, int *);
503 
504 	*v = 0x0305;	/* XXX !!!! */
505 	m->m_len = sizeof(int);
506 	return (0);
507 }
508 
509 /*
510  * Set PIM assert processing global
511  */
512 static int
513 set_assert(m)
514 	struct mbuf *m;
515 {
516 	int *i;
517 
518 	if (m == 0 || m->m_len < sizeof(int))
519 		return (EINVAL);
520 
521 	i = mtod(m, int *);
522 	pim_assert = !!*i;
523 	return (0);
524 }
525 
526 /*
527  * Get PIM assert processing global
528  */
529 static int
530 get_assert(m)
531 	struct mbuf *m;
532 {
533 	int *i = mtod(m, int *);
534 
535 	*i = pim_assert;
536 	m->m_len = sizeof(int);
537 	return (0);
538 }
539 
540 static struct sockaddr_in sin = { sizeof(sin), AF_INET };
541 
542 /*
543  * Add a vif to the vif table
544  */
545 static int
546 add_vif(m)
547 	struct mbuf *m;
548 {
549 	struct vifctl *vifcp;
550 	struct vif *vifp;
551 	struct ifaddr *ifa;
552 	struct ifnet *ifp;
553 	struct ifreq ifr;
554 	int error, s;
555 
556 	if (m == 0 || m->m_len < sizeof(struct vifctl))
557 		return (EINVAL);
558 
559 	vifcp = mtod(m, struct vifctl *);
560 	if (vifcp->vifc_vifi >= MAXVIFS)
561 		return (EINVAL);
562 
563 	vifp = &viftable[vifcp->vifc_vifi];
564 	if (!in_nullhost(vifp->v_lcl_addr))
565 		return (EADDRINUSE);
566 
567 	/* Find the interface with an address in AF_INET family. */
568 	sin.sin_addr = vifcp->vifc_lcl_addr;
569 	ifa = ifa_ifwithaddr(sintosa(&sin));
570 	if (ifa == 0)
571 		return (EADDRNOTAVAIL);
572 
573 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
574 		if (vifcp->vifc_flags & VIFF_SRCRT) {
575 			log(LOG_ERR, "Source routed tunnels not supported\n");
576 			return (EOPNOTSUPP);
577 		}
578 
579 		/* attach this vif to decapsulator dispatch table */
580 		vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
581 		    vif_encapcheck, &vif_protosw, vifp);
582 		if (!vifp->v_encap_cookie)
583 			return (EINVAL);
584 
585 		/* Create a fake encapsulation interface. */
586 		ifp = (struct ifnet *)malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK);
587 		bzero(ifp, sizeof(*ifp));
588 		sprintf(ifp->if_xname, "mdecap%d", vifcp->vifc_vifi);
589 
590 		/* Prepare cached route entry. */
591 		bzero(&vifp->v_route, sizeof(vifp->v_route));
592 	} else {
593 		/* Use the physical interface associated with the address. */
594 		ifp = ifa->ifa_ifp;
595 
596 		/* Make sure the interface supports multicast. */
597 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
598 			return (EOPNOTSUPP);
599 
600 		/* Enable promiscuous reception of all IP multicasts. */
601 		satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
602 		satosin(&ifr.ifr_addr)->sin_family = AF_INET;
603 		satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
604 		error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
605 		if (error)
606 			return (error);
607 	}
608 
609 	s = splsoftnet();
610 
611 	/* Define parameters for the tbf structure. */
612 	vifp->tbf_q = 0;
613 	vifp->tbf_t = &vifp->tbf_q;
614 	microtime(&vifp->tbf_last_pkt_t);
615 	vifp->tbf_n_tok = 0;
616 	vifp->tbf_q_len = 0;
617 	vifp->tbf_max_q_len = MAXQSIZE;
618 
619 	vifp->v_flags = vifcp->vifc_flags;
620 	vifp->v_threshold = vifcp->vifc_threshold;
621 	/* scaling up here allows division by 1024 in critical code */
622 	vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
623 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
624 	vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
625 	vifp->v_ifp = ifp;
626 	/* Initialize per vif pkt counters. */
627 	vifp->v_pkt_in = 0;
628 	vifp->v_pkt_out = 0;
629 	vifp->v_bytes_in = 0;
630 	vifp->v_bytes_out = 0;
631 
632 	callout_init(&vifp->v_repq_ch);
633 
634 #ifdef RSVP_ISI
635 	vifp->v_rsvp_on = 0;
636 	vifp->v_rsvpd = 0;
637 #endif /* RSVP_ISI */
638 
639 	splx(s);
640 
641 	/* Adjust numvifs up if the vifi is higher than numvifs. */
642 	if (numvifs <= vifcp->vifc_vifi)
643 		numvifs = vifcp->vifc_vifi + 1;
644 
645 	if (mrtdebug)
646 		log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
647 		    vifcp->vifc_vifi,
648 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
649 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
650 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
651 		    vifcp->vifc_threshold,
652 		    vifcp->vifc_rate_limit);
653 
654 	return (0);
655 }
656 
657 void
658 reset_vif(vifp)
659 	struct vif *vifp;
660 {
661 	struct mbuf *m, *n;
662 	struct ifnet *ifp;
663 	struct ifreq ifr;
664 
665 	callout_stop(&vifp->v_repq_ch);
666 
667 	/* detach this vif from decapsulator dispatch table */
668 	encap_detach(vifp->v_encap_cookie);
669 	vifp->v_encap_cookie = NULL;
670 
671 	for (m = vifp->tbf_q; m != 0; m = n) {
672 		n = m->m_nextpkt;
673 		m_freem(m);
674 	}
675 
676 	if (vifp->v_flags & VIFF_TUNNEL) {
677 		free(vifp->v_ifp, M_MRTABLE);
678 		if (vifp == last_encap_vif) {
679 			last_encap_vif = 0;
680 			last_encap_src = zeroin_addr;
681 		}
682 	} else {
683 		satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
684 		satosin(&ifr.ifr_addr)->sin_family = AF_INET;
685 		satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
686 		ifp = vifp->v_ifp;
687 		(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
688 	}
689 	bzero((caddr_t)vifp, sizeof(*vifp));
690 }
691 
692 /*
693  * Delete a vif from the vif table
694  */
695 static int
696 del_vif(m)
697 	struct mbuf *m;
698 {
699 	vifi_t *vifip;
700 	struct vif *vifp;
701 	vifi_t vifi;
702 	int s;
703 
704 	if (m == 0 || m->m_len < sizeof(vifi_t))
705 		return (EINVAL);
706 
707 	vifip = mtod(m, vifi_t *);
708 	if (*vifip >= numvifs)
709 		return (EINVAL);
710 
711 	vifp = &viftable[*vifip];
712 	if (in_nullhost(vifp->v_lcl_addr))
713 		return (EADDRNOTAVAIL);
714 
715 	s = splsoftnet();
716 
717 	reset_vif(vifp);
718 
719 	/* Adjust numvifs down */
720 	for (vifi = numvifs; vifi > 0; vifi--)
721 		if (!in_nullhost(viftable[vifi-1].v_lcl_addr))
722 			break;
723 	numvifs = vifi;
724 
725 	splx(s);
726 
727 	if (mrtdebug)
728 		log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
729 
730 	return (0);
731 }
732 
733 static void
734 update_mfc(mfccp, rt)
735 	struct mfcctl *mfccp;
736 	struct mfc *rt;
737 {
738 	vifi_t vifi;
739 
740 	rt->mfc_parent = mfccp->mfcc_parent;
741 	for (vifi = 0; vifi < numvifs; vifi++)
742 		rt->mfc_ttls[vifi] = mfccp->mfcc_ttls[vifi];
743 	rt->mfc_expire = 0;
744 	rt->mfc_stall = 0;
745 }
746 
747 static void
748 expire_mfc(rt)
749 	struct mfc *rt;
750 {
751 	struct rtdetq *rte, *nrte;
752 
753 	for (rte = rt->mfc_stall; rte != 0; rte = nrte) {
754 		nrte = rte->next;
755 		m_freem(rte->m);
756 		free(rte, M_MRTABLE);
757 	}
758 
759 	LIST_REMOVE(rt, mfc_hash);
760 	free(rt, M_MRTABLE);
761 }
762 
763 /*
764  * Add an mfc entry
765  */
766 static int
767 add_mfc(m)
768 	struct mbuf *m;
769 {
770 	struct mfcctl *mfccp;
771 	struct mfc *rt;
772 	u_int32_t hash = 0;
773 	struct rtdetq *rte, *nrte;
774 	u_short nstl;
775 	int s;
776 
777 	if (m == 0 || m->m_len < sizeof(struct mfcctl))
778 		return (EINVAL);
779 
780 	mfccp = mtod(m, struct mfcctl *);
781 
782 	s = splsoftnet();
783 	MFCFIND(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp, rt);
784 
785 	/* If an entry already exists, just update the fields */
786 	if (rt) {
787 		if (mrtdebug & DEBUG_MFC)
788 			log(LOG_DEBUG,"add_mfc update o %x g %x p %x\n",
789 			    ntohl(mfccp->mfcc_origin.s_addr),
790 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
791 			    mfccp->mfcc_parent);
792 
793 		if (rt->mfc_expire)
794 			nexpire[hash]--;
795 
796 		update_mfc(mfccp, rt);
797 
798 		splx(s);
799 		return (0);
800 	}
801 
802 	/*
803 	 * Find the entry for which the upcall was made and update
804 	 */
805 	nstl = 0;
806 	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
807 	LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
808 		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
809 		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
810 		    rt->mfc_stall != 0) {
811 			if (nstl++)
812 				log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
813 				    "multiple kernel entries",
814 				    ntohl(mfccp->mfcc_origin.s_addr),
815 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
816 				    mfccp->mfcc_parent, rt->mfc_stall);
817 
818 			if (mrtdebug & DEBUG_MFC)
819 				log(LOG_DEBUG,"add_mfc o %x g %x p %x dbg %p\n",
820 				    ntohl(mfccp->mfcc_origin.s_addr),
821 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
822 				    mfccp->mfcc_parent, rt->mfc_stall);
823 
824 			if (rt->mfc_expire)
825 				nexpire[hash]--;
826 
827 			rte = rt->mfc_stall;
828 			update_mfc(mfccp, rt);
829 
830 			/* free packets Qed at the end of this entry */
831 			for (; rte != 0; rte = nrte) {
832 				nrte = rte->next;
833 #ifdef RSVP_ISI
834 				ip_mdq(rte->m, rte->ifp, rt, -1);
835 #else
836 				ip_mdq(rte->m, rte->ifp, rt);
837 #endif /* RSVP_ISI */
838 				m_freem(rte->m);
839 #ifdef UPCALL_TIMING
840 				collate(&rte->t);
841 #endif /* UPCALL_TIMING */
842 				free(rte, M_MRTABLE);
843 			}
844 		}
845 	}
846 
847 	if (nstl == 0) {
848 		/*
849 		 * No mfc; make a new one
850 		 */
851 		if (mrtdebug & DEBUG_MFC)
852 			log(LOG_DEBUG,"add_mfc no upcall o %x g %x p %x\n",
853 			    ntohl(mfccp->mfcc_origin.s_addr),
854 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
855 			    mfccp->mfcc_parent);
856 
857 		rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
858 		if (rt == 0) {
859 			splx(s);
860 			return (ENOBUFS);
861 		}
862 
863 		rt->mfc_origin = mfccp->mfcc_origin;
864 		rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
865 		/* initialize pkt counters per src-grp */
866 		rt->mfc_pkt_cnt = 0;
867 		rt->mfc_byte_cnt = 0;
868 		rt->mfc_wrong_if = 0;
869 		timerclear(&rt->mfc_last_assert);
870 		update_mfc(mfccp, rt);
871 
872 		/* insert new entry at head of hash chain */
873 		LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
874 	}
875 
876 	splx(s);
877 	return (0);
878 }
879 
#ifdef UPCALL_TIMING
/*
 * Collect delay statistics on the upcalls.
 *
 * t is the time the upcall was queued.  If it lies strictly before the
 * current time, the elapsed microseconds are shifted right by 10 and
 * used, capped at 50, as the index of the upcall_data[] counter to bump.
 */
static void
collate(t)
	struct timeval *t;
{
	struct timeval now;
	u_int32_t usecs;
	u_int32_t slot;

	microtime(&now);

	/* ignore timestamps that are not in the past */
	if (!timercmp(t, &now, <))
		return;

	TV_DELTA(now, *t, usecs);

	slot = usecs >> 10;
	if (slot > 50)
		slot = 50;

	++upcall_data[slot];
}
#endif /* UPCALL_TIMING */
904 
905 /*
906  * Delete an mfc entry
907  */
908 static int
909 del_mfc(m)
910 	struct mbuf *m;
911 {
912 	struct mfcctl *mfccp;
913 	struct mfc *rt;
914 	int s;
915 
916 	if (m == 0 || m->m_len < sizeof(struct mfcctl))
917 		return (EINVAL);
918 
919 	mfccp = mtod(m, struct mfcctl *);
920 
921 	if (mrtdebug & DEBUG_MFC)
922 		log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
923 		    ntohl(mfccp->mfcc_origin.s_addr),
924 		    ntohl(mfccp->mfcc_mcastgrp.s_addr));
925 
926 	s = splsoftnet();
927 
928 	MFCFIND(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp, rt);
929 	if (rt == 0) {
930 		splx(s);
931 		return (EADDRNOTAVAIL);
932 	}
933 
934 	LIST_REMOVE(rt, mfc_hash);
935 	free(rt, M_MRTABLE);
936 
937 	splx(s);
938 	return (0);
939 }
940 
941 static int
942 socket_send(s, mm, src)
943     struct socket *s;
944     struct mbuf *mm;
945     struct sockaddr_in *src;
946 {
947     if (s) {
948 	if (sbappendaddr(&s->so_rcv, sintosa(src), mm, (struct mbuf *)0) != 0) {
949 	    sorwakeup(s);
950 	    return (0);
951 	}
952     }
953     m_freem(mm);
954     return (-1);
955 }
956 
957 /*
958  * IP multicast forwarding function. This function assumes that the packet
959  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
960  * pointed to by "ifp", and the packet is to be relayed to other networks
961  * that have members of the packet's destination IP multicast group.
962  *
963  * The packet is returned unscathed to the caller, unless it is
964  * erroneous, in which case a non-zero return value tells the caller to
965  * discard it.
966  */
967 
968 #define IP_HDR_LEN  20	/* # bytes of fixed IP header (excluding options) */
969 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
970 
971 int
972 #ifdef RSVP_ISI
973 ip_mforward(m, ifp, imo)
974 #else
975 ip_mforward(m, ifp)
976 #endif /* RSVP_ISI */
977     struct mbuf *m;
978     struct ifnet *ifp;
979 #ifdef RSVP_ISI
980     struct ip_moptions *imo;
981 #endif /* RSVP_ISI */
982 {
983     struct ip *ip = mtod(m, struct ip *);
984     struct mfc *rt;
985     u_char *ipoptions;
986     static int srctun = 0;
987     struct mbuf *mm;
988     int s;
989 #ifdef RSVP_ISI
990     struct vif *vifp;
991     vifi_t vifi;
992 #endif /* RSVP_ISI */
993 
994     /*
995      * Clear any in-bound checksum flags for this packet.
996      */
997     m->m_pkthdr.csum_flags = 0;
998 
999     if (mrtdebug & DEBUG_FORWARD)
1000 	log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
1001 	    ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
1002 
1003     if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
1004 	(ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR) {
1005 	/*
1006 	 * Packet arrived via a physical interface or
1007 	 * an encapuslated tunnel.
1008 	 */
1009     } else {
1010 	/*
1011 	 * Packet arrived through a source-route tunnel.
1012 	 * Source-route tunnels are no longer supported.
1013 	 */
1014 	if ((srctun++ % 1000) == 0)
1015 	    log(LOG_ERR, "ip_mforward: received source-routed packet from %x\n",
1016 		ntohl(ip->ip_src.s_addr));
1017 
1018 	return (1);
1019     }
1020 
1021 #ifdef RSVP_ISI
1022     if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
1023 	if (ip->ip_ttl < 255)
1024 	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
1025 	if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
1026 	    vifp = viftable + vifi;
1027 	    printf("Sending IPPROTO_RSVP from %x to %x on vif %d (%s%s)\n",
1028 		ntohl(ip->ip_src), ntohl(ip->ip_dst), vifi,
1029 		(vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
1030 		vifp->v_ifp->if_xname);
1031 	}
1032 	return (ip_mdq(m, ifp, (struct mfc *)0, vifi));
1033     }
1034     if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
1035 	printf("Warning: IPPROTO_RSVP from %x to %x without vif option\n",
1036 	    ntohl(ip->ip_src), ntohl(ip->ip_dst));
1037     }
1038 #endif /* RSVP_ISI */
1039 
1040     /*
1041      * Don't forward a packet with time-to-live of zero or one,
1042      * or a packet destined to a local-only group.
1043      */
1044     if (ip->ip_ttl <= 1 ||
1045 	IN_LOCAL_GROUP(ip->ip_dst.s_addr))
1046 	return (0);
1047 
1048     /*
1049      * Determine forwarding vifs from the forwarding cache table
1050      */
1051     s = splsoftnet();
1052     MFCFIND(ip->ip_src, ip->ip_dst, rt);
1053 
1054     /* Entry exists, so forward if necessary */
1055     if (rt != 0) {
1056 	splx(s);
1057 #ifdef RSVP_ISI
1058 	return (ip_mdq(m, ifp, rt, -1));
1059 #else
1060 	return (ip_mdq(m, ifp, rt));
1061 #endif /* RSVP_ISI */
1062     } else {
1063 	/*
1064 	 * If we don't have a route for packet's origin,
1065 	 * Make a copy of the packet &
1066 	 * send message to routing daemon
1067 	 */
1068 
1069 	struct mbuf *mb0;
1070 	struct rtdetq *rte;
1071 	u_int32_t hash;
1072 	int hlen = ip->ip_hl << 2;
1073 #ifdef UPCALL_TIMING
1074 	struct timeval tp;
1075 
1076 	microtime(&tp);
1077 #endif /* UPCALL_TIMING */
1078 
1079 	mrtstat.mrts_no_route++;
1080 	if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
1081 	    log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
1082 		ntohl(ip->ip_src.s_addr),
1083 		ntohl(ip->ip_dst.s_addr));
1084 
1085 	/*
1086 	 * Allocate mbufs early so that we don't do extra work if we are
1087 	 * just going to fail anyway.  Make sure to pullup the header so
1088 	 * that other people can't step on it.
1089 	 */
1090 	rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
1091 	if (rte == 0) {
1092 	    splx(s);
1093 	    return (ENOBUFS);
1094 	}
1095 	mb0 = m_copy(m, 0, M_COPYALL);
1096 	M_PULLUP(mb0, hlen);
1097 	if (mb0 == 0) {
1098 	    free(rte, M_MRTABLE);
1099 	    splx(s);
1100 	    return (ENOBUFS);
1101 	}
1102 
1103 	/* is there an upcall waiting for this packet? */
1104 	hash = MFCHASH(ip->ip_src, ip->ip_dst);
1105 	LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1106 	    if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
1107 		in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
1108 		rt->mfc_stall != 0)
1109 		break;
1110 	}
1111 
1112 	if (rt == 0) {
1113 	    int i;
1114 	    struct igmpmsg *im;
1115 
1116 	    /* no upcall, so make a new entry */
1117 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
1118 	    if (rt == 0) {
1119 		free(rte, M_MRTABLE);
1120 		m_freem(mb0);
1121 		splx(s);
1122 		return (ENOBUFS);
1123 	    }
1124 	    /* Make a copy of the header to send to the user level process */
1125 	    mm = m_copy(m, 0, hlen);
1126 	    M_PULLUP(mm, hlen);
1127 	    if (mm == 0) {
1128 		free(rte, M_MRTABLE);
1129 		m_freem(mb0);
1130 		free(rt, M_MRTABLE);
1131 		splx(s);
1132 		return (ENOBUFS);
1133 	    }
1134 
1135 	    /*
1136 	     * Send message to routing daemon to install
1137 	     * a route into the kernel table
1138 	     */
1139 	    sin.sin_addr = ip->ip_src;
1140 
1141 	    im = mtod(mm, struct igmpmsg *);
1142 	    im->im_msgtype	= IGMPMSG_NOCACHE;
1143 	    im->im_mbz		= 0;
1144 
1145 	    mrtstat.mrts_upcalls++;
1146 
1147 	    if (socket_send(ip_mrouter, mm, &sin) < 0) {
1148 		log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
1149 		++mrtstat.mrts_upq_sockfull;
1150 		free(rte, M_MRTABLE);
1151 		m_freem(mb0);
1152 		free(rt, M_MRTABLE);
1153 		splx(s);
1154 		return (ENOBUFS);
1155 	    }
1156 
1157 	    /* insert new entry at head of hash chain */
1158 	    rt->mfc_origin = ip->ip_src;
1159 	    rt->mfc_mcastgrp = ip->ip_dst;
1160 	    rt->mfc_pkt_cnt = 0;
1161 	    rt->mfc_byte_cnt = 0;
1162 	    rt->mfc_wrong_if = 0;
1163 	    rt->mfc_expire = UPCALL_EXPIRE;
1164 	    nexpire[hash]++;
1165 	    for (i = 0; i < numvifs; i++)
1166 		rt->mfc_ttls[i] = 0;
1167 	    rt->mfc_parent = -1;
1168 
1169 	    /* link into table */
1170 	    LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
1171 	    /* Add this entry to the end of the queue */
1172 	    rt->mfc_stall = rte;
1173 	} else {
1174 	    /* determine if q has overflowed */
1175 	    struct rtdetq **p;
1176 	    int npkts = 0;
1177 
1178 	    for (p = &rt->mfc_stall; *p != 0; p = &(*p)->next)
1179 		if (++npkts > MAX_UPQ) {
1180 		    mrtstat.mrts_upq_ovflw++;
1181 		    free(rte, M_MRTABLE);
1182 		    m_freem(mb0);
1183 		    splx(s);
1184 		    return (0);
1185 	        }
1186 
1187 	    /* Add this entry to the end of the queue */
1188 	    *p = rte;
1189 	}
1190 
1191 	rte->next		= 0;
1192 	rte->m 			= mb0;
1193 	rte->ifp 		= ifp;
1194 #ifdef UPCALL_TIMING
1195 	rte->t			= tp;
1196 #endif /* UPCALL_TIMING */
1197 
1198 
1199 	splx(s);
1200 
1201 	return (0);
1202     }
1203 }
1204 
1205 
1206 /*ARGSUSED*/
1207 static void
1208 expire_upcalls(v)
1209 	void *v;
1210 {
1211 	int i;
1212 	int s;
1213 
1214 	s = splsoftnet();
1215 
1216 	for (i = 0; i < MFCTBLSIZ; i++) {
1217 		struct mfc *rt, *nrt;
1218 
1219 		if (nexpire[i] == 0)
1220 			continue;
1221 
1222 		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
1223 			nrt = LIST_NEXT(rt, mfc_hash);
1224 
1225 			if (rt->mfc_expire == 0 ||
1226 			    --rt->mfc_expire > 0)
1227 				continue;
1228 			nexpire[i]--;
1229 
1230 			++mrtstat.mrts_cache_cleanups;
1231 			if (mrtdebug & DEBUG_EXPIRE)
1232 				log(LOG_DEBUG,
1233 				    "expire_upcalls: expiring (%x %x)\n",
1234 				    ntohl(rt->mfc_origin.s_addr),
1235 				    ntohl(rt->mfc_mcastgrp.s_addr));
1236 
1237 			expire_mfc(rt);
1238 		}
1239 	}
1240 
1241 	splx(s);
1242 	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
1243 	    expire_upcalls, NULL);
1244 }
1245 
1246 /*
1247  * Packet forwarding routine once entry in the cache is made
1248  */
1249 static int
1250 #ifdef RSVP_ISI
1251 ip_mdq(m, ifp, rt, xmt_vif)
1252 #else
1253 ip_mdq(m, ifp, rt)
1254 #endif /* RSVP_ISI */
1255     struct mbuf *m;
1256     struct ifnet *ifp;
1257     struct mfc *rt;
1258 #ifdef RSVP_ISI
1259     vifi_t xmt_vif;
1260 #endif /* RSVP_ISI */
1261 {
1262     struct ip  *ip = mtod(m, struct ip *);
1263     vifi_t vifi;
1264     struct vif *vifp;
1265     int plen = ntohs(ip->ip_len);
1266 
1267 /*
1268  * Macro to send packet on vif.  Since RSVP packets don't get counted on
1269  * input, they shouldn't get counted on output, so statistics keeping is
1270  * separate.
1271  */
1272 #define MC_SEND(ip,vifp,m) {                             \
1273                 if ((vifp)->v_flags & VIFF_TUNNEL)	 \
1274                     encap_send((ip), (vifp), (m));       \
1275                 else                                     \
1276                     phyint_send((ip), (vifp), (m));      \
1277 }
1278 
1279 #ifdef RSVP_ISI
1280     /*
1281      * If xmt_vif is not -1, send on only the requested vif.
1282      *
1283      * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.
1284      */
1285     if (xmt_vif < numvifs) {
1286         MC_SEND(ip, viftable + xmt_vif, m);
1287 	return (1);
1288     }
1289 #endif /* RSVP_ISI */
1290 
1291     /*
1292      * Don't forward if it didn't arrive from the parent vif for its origin.
1293      */
1294     vifi = rt->mfc_parent;
1295     if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
1296 	/* came in the wrong interface */
1297 	if (mrtdebug & DEBUG_FORWARD)
1298 	    log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
1299 		ifp, vifi, viftable[vifi].v_ifp);
1300 	++mrtstat.mrts_wrong_if;
1301 	++rt->mfc_wrong_if;
1302 	/*
1303 	 * If we are doing PIM assert processing, and we are forwarding
1304 	 * packets on this interface, and it is a broadcast medium
1305 	 * interface (and not a tunnel), send a message to the routing daemon.
1306 	 */
1307 	if (pim_assert && rt->mfc_ttls[vifi] &&
1308 		(ifp->if_flags & IFF_BROADCAST) &&
1309 		!(viftable[vifi].v_flags & VIFF_TUNNEL)) {
1310 	    struct mbuf *mm;
1311 	    struct igmpmsg *im;
1312 	    int hlen = ip->ip_hl << 2;
1313 	    struct timeval now;
1314 	    u_int32_t delta;
1315 
1316 	    microtime(&now);
1317 
1318 	    TV_DELTA(rt->mfc_last_assert, now, delta);
1319 
1320 	    if (delta > ASSERT_MSG_TIME) {
1321 		mm = m_copy(m, 0, hlen);
1322 		M_PULLUP(mm, hlen);
1323 		if (mm == 0) {
1324 		    return (ENOBUFS);
1325 		}
1326 
1327 		rt->mfc_last_assert = now;
1328 
1329 		im = mtod(mm, struct igmpmsg *);
1330 		im->im_msgtype	= IGMPMSG_WRONGVIF;
1331 		im->im_mbz	= 0;
1332 		im->im_vif	= vifi;
1333 
1334 		sin.sin_addr = im->im_src;
1335 
1336 		socket_send(ip_mrouter, mm, &sin);
1337 	    }
1338 	}
1339 	return (0);
1340     }
1341 
1342     /* If I sourced this packet, it counts as output, else it was input. */
1343     if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
1344 	viftable[vifi].v_pkt_out++;
1345 	viftable[vifi].v_bytes_out += plen;
1346     } else {
1347 	viftable[vifi].v_pkt_in++;
1348 	viftable[vifi].v_bytes_in += plen;
1349     }
1350     rt->mfc_pkt_cnt++;
1351     rt->mfc_byte_cnt += plen;
1352 
1353     /*
1354      * For each vif, decide if a copy of the packet should be forwarded.
1355      * Forward if:
1356      *		- the ttl exceeds the vif's threshold
1357      *		- there are group members downstream on interface
1358      */
1359     for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
1360 	if ((rt->mfc_ttls[vifi] > 0) &&
1361 	    (ip->ip_ttl > rt->mfc_ttls[vifi])) {
1362 	    vifp->v_pkt_out++;
1363 	    vifp->v_bytes_out += plen;
1364 	    MC_SEND(ip, vifp, m);
1365 	}
1366 
1367     return (0);
1368 }
1369 
#ifdef RSVP_ISI
/*
 * Tell whether a vif number refers to a slot below numvifs.  This is
 * used by ip_output, so that numvifs itself need not be exported there.
 *
 * Returns 1 when 0 <= vif < numvifs, otherwise 0.
 */
int
legal_vif_num(vif)
    int vif;
{
    if (vif < 0)
	return (0);
    if (vif >= numvifs)
	return (0);
    return (1);
}
#endif /* RSVP_ISI */
1385 
1386 static void
1387 phyint_send(ip, vifp, m)
1388 	struct ip *ip;
1389 	struct vif *vifp;
1390 	struct mbuf *m;
1391 {
1392 	struct mbuf *mb_copy;
1393 	int hlen = ip->ip_hl << 2;
1394 
1395 	/*
1396 	 * Make a new reference to the packet; make sure that
1397 	 * the IP header is actually copied, not just referenced,
1398 	 * so that ip_output() only scribbles on the copy.
1399 	 */
1400 	mb_copy = m_copy(m, 0, M_COPYALL);
1401 	M_PULLUP(mb_copy, hlen);
1402 	if (mb_copy == 0)
1403 		return;
1404 
1405 	if (vifp->v_rate_limit <= 0)
1406 		tbf_send_packet(vifp, mb_copy);
1407 	else
1408 		tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len);
1409 }
1410 
1411 static void
1412 encap_send(ip, vifp, m)
1413 	struct ip *ip;
1414 	struct vif *vifp;
1415 	struct mbuf *m;
1416 {
1417 	struct mbuf *mb_copy;
1418 	struct ip *ip_copy;
1419 	int i, len = ip->ip_len + sizeof(multicast_encap_iphdr);
1420 
1421 	/*
1422 	 * copy the old packet & pullup it's IP header into the
1423 	 * new mbuf so we can modify it.  Try to fill the new
1424 	 * mbuf since if we don't the ethernet driver will.
1425 	 */
1426 	MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
1427 	if (mb_copy == 0)
1428 		return;
1429 	mb_copy->m_data += max_linkhdr;
1430 	mb_copy->m_pkthdr.len = len;
1431 	mb_copy->m_len = sizeof(multicast_encap_iphdr);
1432 
1433 	if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == 0) {
1434 		m_freem(mb_copy);
1435 		return;
1436 	}
1437 	i = MHLEN - max_linkhdr;
1438 	if (i > len)
1439 		i = len;
1440 	mb_copy = m_pullup(mb_copy, i);
1441 	if (mb_copy == 0)
1442 		return;
1443 
1444 	/*
1445 	 * fill in the encapsulating IP header.
1446 	 */
1447 	ip_copy = mtod(mb_copy, struct ip *);
1448 	*ip_copy = multicast_encap_iphdr;
1449 	ip_copy->ip_id = htons(ip_id++);
1450 	ip_copy->ip_len = len;
1451 	ip_copy->ip_src = vifp->v_lcl_addr;
1452 	ip_copy->ip_dst = vifp->v_rmt_addr;
1453 
1454 	/*
1455 	 * turn the encapsulated IP header back into a valid one.
1456 	 */
1457 	ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
1458 	--ip->ip_ttl;
1459 	HTONS(ip->ip_len);
1460 	HTONS(ip->ip_off);
1461 	ip->ip_sum = 0;
1462 	mb_copy->m_data += sizeof(multicast_encap_iphdr);
1463 	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
1464 	mb_copy->m_data -= sizeof(multicast_encap_iphdr);
1465 
1466 	if (vifp->v_rate_limit <= 0)
1467 		tbf_send_packet(vifp, mb_copy);
1468 	else
1469 		tbf_control(vifp, mb_copy, ip, ip_copy->ip_len);
1470 }
1471 
1472 /*
1473  * De-encapsulate a packet and feed it back through ip input.
1474  */
1475 static void
1476 #if __STDC__
1477 vif_input(struct mbuf *m, ...)
1478 #else
1479 vif_input(m, va_alist)
1480 	struct mbuf *m;
1481 	va_dcl
1482 #endif
1483 {
1484 	int off, proto;
1485 	va_list ap;
1486 	struct ip *ip;
1487 	struct vif *vifp;
1488 	int s;
1489 	struct ifqueue *ifq;
1490 
1491 	va_start(ap, m);
1492 	off = va_arg(ap, int);
1493 	proto = va_arg(ap, int);
1494 	va_end(ap);
1495 
1496 	vifp = (struct vif *)encap_getarg(m);
1497 	if (!vifp || proto != AF_INET) {
1498 		m_freem(m);
1499 		mrtstat.mrts_bad_tunnel++;
1500 		return;
1501 	}
1502 
1503 	ip = mtod(m, struct ip *);
1504 
1505 	m_adj(m, off);
1506 	m->m_pkthdr.rcvif = vifp->v_ifp;
1507 	ifq = &ipintrq;
1508 	s = splnet();
1509 	if (IF_QFULL(ifq)) {
1510 		IF_DROP(ifq);
1511 		m_freem(m);
1512 	} else {
1513 		IF_ENQUEUE(ifq, m);
1514 		/*
1515 		 * normally we would need a "schednetisr(NETISR_IP)"
1516 		 * here but we were called by ip_input and it is going
1517 		 * to loop back & try to dequeue the packet we just
1518 		 * queued as soon as we return so we avoid the
1519 		 * unnecessary software interrrupt.
1520 		 */
1521 	}
1522 	splx(s);
1523 }
1524 
1525 /*
1526  * Check if the packet should be grabbed by us.
1527  */
1528 static int
1529 vif_encapcheck(m, off, proto, arg)
1530 	const struct mbuf *m;
1531 	int off;
1532 	int proto;
1533 	void *arg;
1534 {
1535 	struct vif *vifp;
1536 	struct ip ip;
1537 
1538 #ifdef DIAGNOSTIC
1539 	if (!arg || proto != IPPROTO_IPV4)
1540 		panic("unexpected arg in vif_encapcheck");
1541 #endif
1542 
1543 	/*
1544 	 * do not grab the packet if it's not to a multicast destination or if
1545 	 * we don't have an encapsulating tunnel with the source.
1546 	 * Note:  This code assumes that the remote site IP address
1547 	 * uniquely identifies the tunnel (i.e., that this site has
1548 	 * at most one tunnel with the remote site).
1549 	 */
1550 
1551 	/* LINTED const cast */
1552 	m_copydata((struct mbuf *)m, off, sizeof(ip), (caddr_t)&ip);
1553 	if (!IN_MULTICAST(ip.ip_dst.s_addr))
1554 		return 0;
1555 
1556 	/* LINTED const cast */
1557 	m_copydata((struct mbuf *)m, 0, sizeof(ip), (caddr_t)&ip);
1558 	if (!in_hosteq(ip.ip_src, last_encap_src)) {
1559 		vifp = (struct vif *)arg;
1560 		if (vifp->v_flags & VIFF_TUNNEL &&
1561 		    in_hosteq(vifp->v_rmt_addr, ip.ip_src))
1562 			;
1563 		else
1564 			return 0;
1565 		last_encap_vif = vifp;
1566 		last_encap_src = ip.ip_src;
1567 	} else
1568 		vifp = last_encap_vif;
1569 
1570 	/* 32bit match, since we have checked ip_src only */
1571 	return 32;
1572 }
1573 
1574 /*
1575  * Token bucket filter module
1576  */
1577 static void
1578 tbf_control(vifp, m, ip, len)
1579 	struct vif *vifp;
1580 	struct mbuf *m;
1581 	struct ip *ip;
1582 	u_int32_t len;
1583 {
1584 
1585 	if (len > MAX_BKT_SIZE) {
1586 		/* drop if packet is too large */
1587 		mrtstat.mrts_pkt2large++;
1588 		m_freem(m);
1589 		return;
1590 	}
1591 
1592 	tbf_update_tokens(vifp);
1593 
1594 	/*
1595 	 * If there are enough tokens, and the queue is empty, send this packet
1596 	 * out immediately.  Otherwise, try to insert it on this vif's queue.
1597 	 */
1598 	if (vifp->tbf_q_len == 0) {
1599 		if (len <= vifp->tbf_n_tok) {
1600 			vifp->tbf_n_tok -= len;
1601 			tbf_send_packet(vifp, m);
1602 		} else {
1603 			/* queue packet and timeout till later */
1604 			tbf_queue(vifp, m);
1605 			callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1606 			    tbf_reprocess_q, vifp);
1607 		}
1608 	} else {
1609 		if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
1610 		    !tbf_dq_sel(vifp, ip)) {
1611 			/* queue length too much, and couldn't make room */
1612 			mrtstat.mrts_q_overflow++;
1613 			m_freem(m);
1614 		} else {
1615 			/* queue length low enough, or made room */
1616 			tbf_queue(vifp, m);
1617 			tbf_process_q(vifp);
1618 		}
1619 	}
1620 }
1621 
1622 /*
1623  * adds a packet to the queue at the interface
1624  */
1625 static void
1626 tbf_queue(vifp, m)
1627 	struct vif *vifp;
1628 	struct mbuf *m;
1629 {
1630 	int s = splsoftnet();
1631 
1632 	/* insert at tail */
1633 	*vifp->tbf_t = m;
1634 	vifp->tbf_t = &m->m_nextpkt;
1635 	vifp->tbf_q_len++;
1636 
1637 	splx(s);
1638 }
1639 
1640 
1641 /*
1642  * processes the queue at the interface
1643  */
1644 static void
1645 tbf_process_q(vifp)
1646 	struct vif *vifp;
1647 {
1648 	struct mbuf *m;
1649 	int len;
1650 	int s = splsoftnet();
1651 
1652 	/*
1653 	 * Loop through the queue at the interface and send as many packets
1654 	 * as possible.
1655 	 */
1656 	for (m = vifp->tbf_q;
1657 	    m != 0;
1658 	    m = vifp->tbf_q) {
1659 		len = mtod(m, struct ip *)->ip_len;
1660 
1661 		/* determine if the packet can be sent */
1662 		if (len <= vifp->tbf_n_tok) {
1663 			/* if so,
1664 			 * reduce no of tokens, dequeue the packet,
1665 			 * send the packet.
1666 			 */
1667 			if ((vifp->tbf_q = m->m_nextpkt) == 0)
1668 				vifp->tbf_t = &vifp->tbf_q;
1669 			--vifp->tbf_q_len;
1670 
1671 			m->m_nextpkt = 0;
1672 			vifp->tbf_n_tok -= len;
1673 			tbf_send_packet(vifp, m);
1674 		} else
1675 			break;
1676 	}
1677 	splx(s);
1678 }
1679 
1680 static void
1681 tbf_reprocess_q(arg)
1682 	void *arg;
1683 {
1684 	struct vif *vifp = arg;
1685 
1686 	if (ip_mrouter == 0)
1687 		return;
1688 
1689 	tbf_update_tokens(vifp);
1690 	tbf_process_q(vifp);
1691 
1692 	if (vifp->tbf_q_len != 0)
1693 		callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1694 		    tbf_reprocess_q, vifp);
1695 }
1696 
1697 /* function that will selectively discard a member of the queue
1698  * based on the precedence value and the priority
1699  */
1700 static int
1701 tbf_dq_sel(vifp, ip)
1702 	struct vif *vifp;
1703 	struct ip *ip;
1704 {
1705 	u_int p;
1706 	struct mbuf **mp, *m;
1707 	int s = splsoftnet();
1708 
1709 	p = priority(vifp, ip);
1710 
1711 	for (mp = &vifp->tbf_q, m = *mp;
1712 	    m != 0;
1713 	    mp = &m->m_nextpkt, m = *mp) {
1714 		if (p > priority(vifp, mtod(m, struct ip *))) {
1715 			if ((*mp = m->m_nextpkt) == 0)
1716 				vifp->tbf_t = mp;
1717 			--vifp->tbf_q_len;
1718 
1719 			m_freem(m);
1720 			mrtstat.mrts_drop_sel++;
1721 			splx(s);
1722 			return (1);
1723 		}
1724 	}
1725 	splx(s);
1726 	return (0);
1727 }
1728 
1729 static void
1730 tbf_send_packet(vifp, m)
1731 	struct vif *vifp;
1732 	struct mbuf *m;
1733 {
1734 	int error;
1735 	int s = splsoftnet();
1736 
1737 	if (vifp->v_flags & VIFF_TUNNEL) {
1738 		/* If tunnel options */
1739 #ifdef IPSEC
1740 		/* Don't lookup socket in forwading case */
1741 		(void)ipsec_setsocket(m, NULL);
1742 #endif
1743 		ip_output(m, (struct mbuf *)0, &vifp->v_route,
1744 			  IP_FORWARDING, (struct ip_moptions *)0);
1745 	} else {
1746 		/* if physical interface option, extract the options and then send */
1747 		struct ip_moptions imo;
1748 
1749 		imo.imo_multicast_ifp = vifp->v_ifp;
1750 		imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
1751 		imo.imo_multicast_loop = 1;
1752 #ifdef RSVP_ISI
1753 		imo.imo_multicast_vif = -1;
1754 #endif
1755 
1756 #ifdef IPSEC
1757 		/* Don't lookup socket in forwading case */
1758 		(void)ipsec_setsocket(m, NULL);
1759 #endif
1760 		error = ip_output(m, (struct mbuf *)0, (struct route *)0,
1761 				  IP_FORWARDING|IP_MULTICASTOPTS, &imo);
1762 
1763 		if (mrtdebug & DEBUG_XMIT)
1764 			log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
1765 			    (long)(vifp-viftable), error);
1766 	}
1767 	splx(s);
1768 }
1769 
1770 /* determine the current time and then
1771  * the elapsed time (between the last time and time now)
1772  * in milliseconds & update the no. of tokens in the bucket
1773  */
1774 static void
1775 tbf_update_tokens(vifp)
1776 	struct vif *vifp;
1777 {
1778 	struct timeval tp;
1779 	u_int32_t tm;
1780 	int s = splsoftnet();
1781 
1782 	microtime(&tp);
1783 
1784 	TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
1785 
1786 	/*
1787 	 * This formula is actually
1788 	 * "time in seconds" * "bytes/second".
1789 	 *
1790 	 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
1791 	 *
1792 	 * The (1000/1024) was introduced in add_vif to optimize
1793 	 * this divide into a shift.
1794 	 */
1795 	vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
1796 	vifp->tbf_last_pkt_t = tp;
1797 
1798 	if (vifp->tbf_n_tok > MAX_BKT_SIZE)
1799 		vifp->tbf_n_tok = MAX_BKT_SIZE;
1800 
1801 	splx(s);
1802 }
1803 
1804 static int
1805 priority(vifp, ip)
1806     struct vif *vifp;
1807     struct ip *ip;
1808 {
1809     int prio;
1810 
1811     /* temporary hack; may add general packet classifier some day */
1812 
1813     /*
1814      * The UDP port space is divided up into four priority ranges:
1815      * [0, 16384)     : unclassified - lowest priority
1816      * [16384, 32768) : audio - highest priority
1817      * [32768, 49152) : whiteboard - medium priority
1818      * [49152, 65536) : video - low priority
1819      */
1820     if (ip->ip_p == IPPROTO_UDP) {
1821 	struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
1822 
1823 	switch (ntohs(udp->uh_dport) & 0xc000) {
1824 	    case 0x4000:
1825 		prio = 70;
1826 		break;
1827 	    case 0x8000:
1828 		prio = 60;
1829 		break;
1830 	    case 0xc000:
1831 		prio = 55;
1832 		break;
1833 	    default:
1834 		prio = 50;
1835 		break;
1836 	}
1837 
1838 	if (tbfdebug > 1)
1839 	    log(LOG_DEBUG, "port %x prio %d\n", ntohs(udp->uh_dport), prio);
1840     } else
1841 	prio = 50;
1842 
1843 
1844     return (prio);
1845 }
1846 
1847 /*
1848  * End of token bucket filter modifications
1849  */
1850 
1851 #ifdef RSVP_ISI
1852 
1853 int
1854 ip_rsvp_vif_init(so, m)
1855     struct socket *so;
1856     struct mbuf *m;
1857 {
1858     int i;
1859     int s;
1860 
1861     if (rsvpdebug)
1862 	printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n",
1863 	    so->so_type, so->so_proto->pr_protocol);
1864 
1865     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
1866 	return (EOPNOTSUPP);
1867 
1868     /* Check mbuf. */
1869     if (m == 0 || m->m_len != sizeof(int)) {
1870 	return (EINVAL);
1871     }
1872     i = *(mtod(m, int *));
1873 
1874     if (rsvpdebug)
1875 	printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n",i,rsvp_on);
1876 
1877     s = splsoftnet();
1878 
1879     /* Check vif. */
1880     if (!legal_vif_num(i)) {
1881 	splx(s);
1882 	return (EADDRNOTAVAIL);
1883     }
1884 
1885     /* Check if socket is available. */
1886     if (viftable[i].v_rsvpd != 0) {
1887 	splx(s);
1888 	return (EADDRINUSE);
1889     }
1890 
1891     viftable[i].v_rsvpd = so;
1892     /* This may seem silly, but we need to be sure we don't over-increment
1893      * the RSVP counter, in case something slips up.
1894      */
1895     if (!viftable[i].v_rsvp_on) {
1896 	viftable[i].v_rsvp_on = 1;
1897 	rsvp_on++;
1898     }
1899 
1900     splx(s);
1901     return (0);
1902 }
1903 
1904 int
1905 ip_rsvp_vif_done(so, m)
1906     struct socket *so;
1907     struct mbuf *m;
1908 {
1909     int i;
1910     int s;
1911 
1912     if (rsvpdebug)
1913 	printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n",
1914 	       so->so_type, so->so_proto->pr_protocol);
1915 
1916     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
1917 	return (EOPNOTSUPP);
1918 
1919     /* Check mbuf. */
1920     if (m == 0 || m->m_len != sizeof(int)) {
1921 	return (EINVAL);
1922     }
1923     i = *(mtod(m, int *));
1924 
1925     s = splsoftnet();
1926 
1927     /* Check vif. */
1928     if (!legal_vif_num(i)) {
1929 	splx(s);
1930         return (EADDRNOTAVAIL);
1931     }
1932 
1933     if (rsvpdebug)
1934 	printf("ip_rsvp_vif_done: v_rsvpd = %x so = %x\n",
1935 	    viftable[i].v_rsvpd, so);
1936 
1937     viftable[i].v_rsvpd = 0;
1938     /* This may seem silly, but we need to be sure we don't over-decrement
1939      * the RSVP counter, in case something slips up.
1940      */
1941     if (viftable[i].v_rsvp_on) {
1942 	viftable[i].v_rsvp_on = 0;
1943 	rsvp_on--;
1944     }
1945 
1946     splx(s);
1947     return (0);
1948 }
1949 
1950 void
1951 ip_rsvp_force_done(so)
1952     struct socket *so;
1953 {
1954     int vifi;
1955     int s;
1956 
1957     /* Don't bother if it is not the right type of socket. */
1958     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
1959 	return;
1960 
1961     s = splsoftnet();
1962 
1963     /* The socket may be attached to more than one vif...this
1964      * is perfectly legal.
1965      */
1966     for (vifi = 0; vifi < numvifs; vifi++) {
1967 	if (viftable[vifi].v_rsvpd == so) {
1968 	    viftable[vifi].v_rsvpd = 0;
1969 	    /* This may seem silly, but we need to be sure we don't
1970 	     * over-decrement the RSVP counter, in case something slips up.
1971 	     */
1972 	    if (viftable[vifi].v_rsvp_on) {
1973 		viftable[vifi].v_rsvp_on = 0;
1974 		rsvp_on--;
1975 	    }
1976 	}
1977     }
1978 
1979     splx(s);
1980     return;
1981 }
1982 
1983 void
1984 rsvp_input(m, ifp)
1985     struct mbuf *m;
1986     struct ifnet *ifp;
1987 {
1988     int vifi;
1989     struct ip *ip = mtod(m, struct ip *);
1990     static struct sockaddr_in rsvp_src = { sizeof(sin), AF_INET };
1991     int s;
1992 
1993     if (rsvpdebug)
1994 	printf("rsvp_input: rsvp_on %d\n",rsvp_on);
1995 
1996     /* Can still get packets with rsvp_on = 0 if there is a local member
1997      * of the group to which the RSVP packet is addressed.  But in this
1998      * case we want to throw the packet away.
1999      */
2000     if (!rsvp_on) {
2001 	m_freem(m);
2002 	return;
2003     }
2004 
2005     /* If the old-style non-vif-associated socket is set, then use
2006      * it and ignore the new ones.
2007      */
2008     if (ip_rsvpd != 0) {
2009 	if (rsvpdebug)
2010 	    printf("rsvp_input: Sending packet up old-style socket\n");
2011 	rip_input(m);	/*XXX*/
2012 	return;
2013     }
2014 
2015     s = splsoftnet();
2016 
2017     if (rsvpdebug)
2018 	printf("rsvp_input: check vifs\n");
2019 
2020     /* Find which vif the packet arrived on. */
2021     for (vifi = 0; vifi < numvifs; vifi++) {
2022 	if (viftable[vifi].v_ifp == ifp)
2023 	    break;
2024     }
2025 
2026     if (vifi == numvifs) {
2027 	/* Can't find vif packet arrived on. Drop packet. */
2028 	if (rsvpdebug)
2029 	    printf("rsvp_input: Can't find vif for packet...dropping it.\n");
2030 	m_freem(m);
2031 	splx(s);
2032 	return;
2033     }
2034 
2035     if (rsvpdebug)
2036 	printf("rsvp_input: check socket\n");
2037 
2038     if (viftable[vifi].v_rsvpd == 0) {
2039 	/* drop packet, since there is no specific socket for this
2040 	 * interface */
2041 	if (rsvpdebug)
2042 	    printf("rsvp_input: No socket defined for vif %d\n",vifi);
2043 	m_freem(m);
2044 	splx(s);
2045 	return;
2046     }
2047 
2048     rsvp_src.sin_addr = ip->ip_src;
2049 
2050     if (rsvpdebug && m)
2051 	printf("rsvp_input: m->m_len = %d, sbspace() = %d\n",
2052 	       m->m_len,sbspace(&viftable[vifi].v_rsvpd->so_rcv));
2053 
2054     if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0)
2055 	if (rsvpdebug)
2056 	    printf("rsvp_input: Failed to append to socket\n");
2057     else
2058 	if (rsvpdebug)
2059 	    printf("rsvp_input: send packet up\n");
2060 
2061     splx(s);
2062 }
2063 #endif /* RSVP_ISI */
2064