xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_mroute.c (revision dd4eeefd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.
23  * All rights reserved.  Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Procedures for the kernel part of DVMRP,
31  * a Distance-Vector Multicast Routing Protocol.
32  * (See RFC-1075)
33  * Written by David Waitzman, BBN Labs, August 1988.
34  * Modified by Steve Deering, Stanford, February 1989.
35  * Modified by Mark J. Steiglitz, Stanford, May, 1991
36  * Modified by Van Jacobson, LBL, January 1993
37  * Modified by Ajit Thyagarajan, PARC, August 1993
38  * Modified by Bill Fenner, PARC, April 1995
39  *
40  * MROUTING 3.5
41  */
42 
43 /*
44  * TODO
45  * - function pointer field in vif, void *vif_sendit()
46  */
47 
48 #include <sys/types.h>
49 #include <sys/stream.h>
50 #include <sys/stropts.h>
51 #include <sys/strlog.h>
52 #include <sys/systm.h>
53 #include <sys/ddi.h>
54 #include <sys/cmn_err.h>
55 #include <sys/zone.h>
56 
57 #include <sys/param.h>
58 #include <sys/socket.h>
59 #include <sys/vtrace.h>
60 #include <sys/debug.h>
61 #include <net/if.h>
62 #include <sys/sockio.h>
63 #include <netinet/in.h>
64 #include <net/if_dl.h>
65 
66 #include <inet/common.h>
67 #include <inet/mi.h>
68 #include <inet/nd.h>
69 #include <inet/mib2.h>
70 #include <netinet/ip6.h>
71 #include <inet/ip.h>
72 #include <inet/snmpcom.h>
73 
74 #include <netinet/igmp.h>
75 #include <netinet/igmp_var.h>
76 #include <netinet/udp.h>
77 #include <netinet/ip_mroute.h>
78 #include <inet/ip_multi.h>
79 #include <inet/ip_ire.h>
80 #include <inet/ip_if.h>
81 #include <inet/ipclassifier.h>
82 
83 #include <netinet/pim.h>
84 
85 
86 /*
87  * MT Design:
88  *
89  * There are three main data structures viftable, mfctable and tbftable that
90  * need to be protected against MT races.
91  *
92  * viftable is a fixed length array of vif structs. There is no lock to protect
93  * the whole array, instead each struct is protected by its own individual lock.
94  * The value of v_marks in conjunction with the value of v_refcnt determines the
95  * current state of a vif structure. One special state that needs mention
96  * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
97  * that vif is being initialized.
98  * Each structure is freed when the refcnt goes down to zero. If a delete comes
99  * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
100  * which prevents the struct from further use.  When the refcnt goes to zero
101  * the struct is freed and is marked VIF_MARK_NOTINUSE.
102  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
103  * from  going away a refhold is put on the ipif before using it. see
104  * lock_good_vif() and unlock_good_vif().
105  *
106  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
107  * of the vif struct.
108  *
109  * tbftable is also a fixed length array of tbf structs and is only accessed
110  * via v_tbf.  It is protected by its own lock tbf_lock.
111  *
112  * Lock Ordering is
113  * v_lock --> tbf_lock
114  * v_lock --> ill_lock
115  *
116  * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
117  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
118  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
119  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
120  * protect the struct elements.
121  *
122  * mfc structs are dynamically allocated and are singly linked
123  * at the head of the chain. When an mfc structure is to be deleted
124  * it is marked condemned and so is the state in the bucket struct.
125  * When the last walker of the hash bucket exits all the mfc structs
126  * marked condemned are freed.
127  *
128  * Locking Hierarchy:
129  * The bucket lock should be acquired before the mfc struct lock.
130  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
131  * operations on the bucket struct.
132  *
133  * last_encap_lock and numvifs_mutex should be acquired after
134  * acquiring vif or mfc locks. These locks protect some global variables.
135  *
136  * The statistics are not currently protected by a lock
137  * causing the stats to be approximate, not exact.
138  */
139 
/* Sentinel vif index borrowed from mrouted: no route for the source. */
#define	NO_VIF	(MAXVIFS)
141 
142 /*
143  * Timeouts:
144  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
145  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
146  *	SunOS 5.x uses mfc->timeout for each mfc.
147  *	Some Unixes are limited in the number of simultaneous timeouts
148  * 	that can be run, SunOS 5.x does not have this restriction.
149  */
150 
151 /*
152  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
153  * UPCALL_EXPIRE is the number of timeouts before a particular upcall
154  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
155  */
/* Interval between upcall expiry checks: four times per second. */
#define	EXPIRE_TIMEOUT	(hz / 4)
/* Number of EXPIRE_TIMEOUT intervals before a pending upcall expires. */
#define	UPCALL_EXPIRE	6

/*
 * Hash function for a (source, group) entry.  The address and two
 * right-shifted copies of it (by 10 and by 20 bits) are XOR-folded for
 * both arguments, and MFCHASHMOD reduces the result to a bucket index.
 */
#define	MFCHASH(a, g)						\
	MFCHASHMOD((a) ^ ((a) >> 10) ^ ((a) >> 20) ^		\
	    (g) ^ ((g) >> 10) ^ ((g) >> 20))

/* Interval for reprocessing the token bucket queue: 100 times per second. */
#define	TBF_REPROCESS	(hz / 100)

/* Identify PIM packet that came on a Register interface */
#define	PIM_REGISTER_MARKER	0xffffffff
169 
170 /* Function declarations */
171 static int	add_mfc(struct mfcctl *, ip_stack_t *);
172 static int	add_vif(struct vifctl *, queue_t *, mblk_t *, ip_stack_t *);
173 static int	del_mfc(struct mfcctl *, ip_stack_t *);
174 static int	del_vif(vifi_t *, queue_t *, mblk_t *, ip_stack_t *);
175 static void	del_vifp(struct vif *);
176 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
177 static void	expire_upcalls(void *);
178 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
179 static void	free_queue(struct mfc *);
180 static int	get_assert(uchar_t *, ip_stack_t *);
181 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
182 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
183 static int	get_version(uchar_t *);
184 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
185 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
186 		    ipaddr_t, struct mfc *);
187 static int	ip_mrouter_init(queue_t *, uchar_t *, int, ip_stack_t *);
188 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
189 static int	register_mforward(queue_t *, mblk_t *, ill_t *);
190 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
191 static int	set_assert(int *, ip_stack_t *);
192 
193 /*
194  * Token Bucket Filter functions
195  */
196 static int  priority(struct vif *, ipha_t *);
197 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
198 static int  tbf_dq_sel(struct vif *, ipha_t *);
199 static void tbf_process_q(struct vif *);
200 static void tbf_queue(struct vif *, mblk_t *);
201 static void tbf_reprocess_q(void *);
202 static void tbf_send_packet(struct vif *, mblk_t *);
203 static void tbf_update_tokens(struct vif *);
204 static void release_mfc(struct mfcb *);
205 
206 static boolean_t is_mrouter_off(ip_stack_t *);
207 /*
208  * Encapsulation packets
209  */
210 
211 #define	ENCAP_TTL	64
212 
213 /* prototype IP hdr for encapsulated packets */
214 static ipha_t multicast_encap_iphdr = {
215 	IP_SIMPLE_HDR_VERSION,
216 	0,				/* tos */
217 	sizeof (ipha_t),		/* total length */
218 	0,				/* id */
219 	0,				/* frag offset */
220 	ENCAP_TTL, IPPROTO_ENCAP,
221 	0,				/* checksum */
222 };
223 
/*
 * Rate limit for assert notification messages, in nsec.
 * The value does not fit in a 32-bit int, so it carries an explicit LL
 * suffix rather than relying on the compiler's choice of type for an
 * unsuffixed constant.
 */
#define	ASSERT_MSG_TIME		3000000000LL
228 
229 
/*
 * Take a reference on a vif: v_refcnt is incremented while v_lock is held.
 * Wrapped in do { } while (0) so the macro is one statement and can be
 * used as the body of an unbraced if/else.
 */
#define	VIF_REFHOLD(vifp) do {			\
	mutex_enter(&(vifp)->v_lock);		\
	(vifp)->v_refcnt++;			\
	mutex_exit(&(vifp)->v_lock);		\
} while (0)
235 
/*
 * Drop a reference on a vif whose v_lock the caller already holds.
 * If this was the last reference and the vif is marked
 * VIF_MARK_CONDEMNED, del_vifp() is called and v_lock is not released
 * here (presumably del_vifp() drops it -- confirm against del_vifp()).
 * Otherwise v_lock is released.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define	VIF_REFRELE_LOCKED(vifp) do {				\
	(vifp)->v_refcnt--;					\
	if ((vifp)->v_refcnt == 0 &&				\
	    ((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		del_vifp(vifp);					\
	} else {						\
		mutex_exit(&(vifp)->v_lock);			\
	}							\
} while (0)
245 
/*
 * Drop a reference on a vif.  Acquires v_lock, decrements v_refcnt and,
 * if the count reached zero on a vif marked VIF_MARK_CONDEMNED, calls
 * del_vifp() without releasing v_lock here (presumably del_vifp() drops
 * it -- confirm against del_vifp()).  Otherwise v_lock is released.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define	VIF_REFRELE(vifp) do {					\
	mutex_enter(&(vifp)->v_lock);				\
	(vifp)->v_refcnt--;					\
	if ((vifp)->v_refcnt == 0 &&				\
	    ((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		del_vifp(vifp);					\
	} else {						\
		mutex_exit(&(vifp)->v_lock);			\
	}							\
} while (0)
256 
/*
 * Register a walker on an mfc hash bucket: mfcb_refcnt is incremented
 * under mfcb_lock.  The ASSERT catches the counter wrapping to zero.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define	MFCB_REFHOLD(mfcb) do {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	(mfcb)->mfcb_refcnt++;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
} while (0)
263 
/*
 * Unregister a walker from an mfc hash bucket.  Under mfcb_lock the
 * refcnt is decremented; when the last walker leaves a bucket marked
 * MFCB_MARK_CONDEMNED, release_mfc() is called with mfcb_lock still held.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define	MFCB_REFRELE(mfcb) do {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	if (--(mfcb)->mfcb_refcnt == 0 &&			\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
		release_mfc(mfcb);				\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
} while (0)
273 
274 /*
275  * MFCFIND:
276  * Find a route for a given origin IP address and multicast group address.
277  * Skip entries with pending upcalls.
278  * Type of service parameter to be added in the future!
279  */
/*
 * Walk the chain of bucket `mfcbp' and set `rt' to the first entry whose
 * origin equals `o' and whose group equals `g', skipping entries that
 * have mfc_rte != NULL or are marked condemned.  `rt' is set to NULL
 * when nothing matches.
 *
 * Every argument is parenthesised so expression arguments expand
 * correctly, and the body is wrapped in do { } while (0) so the macro
 * behaves as a single statement.  `o' and `g' are evaluated once per
 * chain entry, so they must not have side effects.
 */
#define	MFCFIND(mfcbp, o, g, rt) do { \
	struct mfc *_mb_rt; \
	(rt) = NULL; \
	for (_mb_rt = (mfcbp)->mfcb_mfc; _mb_rt != NULL; \
	    _mb_rt = _mb_rt->mfc_next) { \
		if ((_mb_rt->mfc_origin.s_addr == (o)) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \
			(rt) = _mb_rt; \
			break; \
		} \
	} \
} while (0)
295 
296 /*
297  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
298  * are inefficient. We use gethrestime() which returns a timespec_t with
299  * sec and nsec, the resolution is machine dependent.
300  * The following 2 macros have been changed to use nsec instead of usec.
301  */
302 /*
303  * Macros to compute elapsed time efficiently.
304  * Borrowed from Van Jacobson's scheduling code.
305  * Delta should be a hrtime_t.
306  */
/*
 * Store (a - b) in nanoseconds into `delta', which should be a 64-bit
 * signed integer (hrtime_t).  Gaps of one or two seconds take the cheap
 * additive path; larger gaps multiply.  The multiplication is done in
 * 64-bit arithmetic: as a plain int product it overflowed as soon as the
 * two timestamps were three or more seconds apart.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define	TV_DELTA(a, b, delta) do { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
			(delta) += 1000000000; \
			/*FALLTHROUGH*/ \
		case 1: \
			(delta) += 1000000000; \
			break; \
		default: \
			(delta) += (1000000000LL * xxs); \
		} \
	} \
} while (0)
324 
/*
 * True when timestamp a is strictly earlier than timestamp b: either the
 * seconds are smaller, or the seconds are equal and the nanoseconds are
 * smaller.
 */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
327 
328 /*
329  * Handle MRT setsockopt commands to modify the multicast routing tables.
330  */
331 int
332 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
333     int datalen, mblk_t *first_mp)
334 {
335 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
336 
337 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
338 	if (cmd != MRT_INIT && q != ipst->ips_ip_g_mrouter) {
339 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
340 		return (EACCES);
341 	}
342 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
343 
344 	if (checkonly) {
345 		/*
346 		 * do not do operation, just pretend to - new T_CHECK
347 		 * Note: Even routines further on can probably fail but
348 		 * this T_CHECK stuff is only to please XTI so it not
349 		 * necessary to be perfect.
350 		 */
351 		switch (cmd) {
352 		case MRT_INIT:
353 		case MRT_DONE:
354 		case MRT_ADD_VIF:
355 		case MRT_DEL_VIF:
356 		case MRT_ADD_MFC:
357 		case MRT_DEL_MFC:
358 		case MRT_ASSERT:
359 		    return (0);
360 		default:
361 		    return (EOPNOTSUPP);
362 		}
363 	}
364 
365 	/*
366 	 * make sure no command is issued after multicast routing has been
367 	 * turned off.
368 	 */
369 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
370 		if (is_mrouter_off(ipst))
371 			return (EINVAL);
372 	}
373 
374 	switch (cmd) {
375 	case MRT_INIT:	return (ip_mrouter_init(q, data, datalen, ipst));
376 	case MRT_DONE:	return (ip_mrouter_done(first_mp, ipst));
377 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, q, first_mp,
378 				    ipst));
379 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, q, first_mp, ipst));
380 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
381 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
382 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
383 	default:	   return (EOPNOTSUPP);
384 	}
385 }
386 
387 /*
388  * Handle MRT getsockopt commands
389  */
390 int
391 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
392 {
393 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
394 
395 	if (q != ipst->ips_ip_g_mrouter)
396 		return (EACCES);
397 
398 	switch (cmd) {
399 	case MRT_VERSION:	return (get_version((uchar_t *)data));
400 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
401 	default:		return (EOPNOTSUPP);
402 	}
403 }
404 
405 /*
406  * Handle ioctl commands to obtain information from the cache.
407  * Called with shared access to IP. These are read_only ioctls.
408  */
409 /* ARGSUSED */
410 int
411 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
412     ip_ioctl_cmd_t *ipip, void *if_req)
413 {
414 	mblk_t	*mp1;
415 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
416 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
417 
418 	/* Existence verified in ip_wput_nondata */
419 	mp1 = mp->b_cont->b_cont;
420 
421 	switch (iocp->ioc_cmd) {
422 	case (SIOCGETVIFCNT):
423 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
424 	case (SIOCGETSGCNT):
425 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
426 	case (SIOCGETLSGCNT):
427 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
428 	default:
429 		return (EINVAL);
430 	}
431 }
432 
433 /*
434  * Returns the packet, byte, rpf-failure count for the source, group provided.
435  */
436 static int
437 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
438 {
439 	struct mfc *rt;
440 	struct mfcb *mfcbp;
441 
442 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
443 	MFCB_REFHOLD(mfcbp);
444 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
445 
446 	if (rt != NULL) {
447 		mutex_enter(&rt->mfc_mutex);
448 		req->pktcnt   = rt->mfc_pkt_cnt;
449 		req->bytecnt  = rt->mfc_byte_cnt;
450 		req->wrong_if = rt->mfc_wrong_if;
451 		mutex_exit(&rt->mfc_mutex);
452 	} else
453 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
454 
455 	MFCB_REFRELE(mfcbp);
456 	return (0);
457 }
458 
459 /*
460  * Returns the packet, byte, rpf-failure count for the source, group provided.
461  * Uses larger counters and IPv6 addresses.
462  */
463 /* ARGSUSED XXX until implemented */
464 static int
465 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
466 {
467 	/* XXX TODO SIOCGETLSGCNT */
468 	return (ENXIO);
469 }
470 
471 /*
472  * Returns the input and output packet and byte counts on the vif provided.
473  */
474 static int
475 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
476 {
477 	vifi_t vifi = req->vifi;
478 
479 	if (vifi >= ipst->ips_numvifs)
480 		return (EINVAL);
481 
482 	/*
483 	 * No locks here, an approximation is fine.
484 	 */
485 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
486 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
487 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
488 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
489 
490 	return (0);
491 }
492 
493 static int
494 get_version(uchar_t *data)
495 {
496 	int *v = (int *)data;
497 
498 	*v = 0x0305;	/* XXX !!!! */
499 
500 	return (0);
501 }
502 
503 /*
504  * Set PIM assert processing global.
505  */
506 static int
507 set_assert(int *i, ip_stack_t *ipst)
508 {
509 	if ((*i != 1) && (*i != 0))
510 		return (EINVAL);
511 
512 	ipst->ips_pim_assert = *i;
513 
514 	return (0);
515 }
516 
517 /*
518  * Get PIM assert processing global.
519  */
520 static int
521 get_assert(uchar_t *data, ip_stack_t *ipst)
522 {
523 	int *i = (int *)data;
524 
525 	*i = ipst->ips_pim_assert;
526 
527 	return (0);
528 }
529 
530 /*
531  * Enable multicast routing.
532  */
533 static int
534 ip_mrouter_init(queue_t *q, uchar_t *data, int datalen, ip_stack_t *ipst)
535 {
536 	conn_t	*connp = Q_TO_CONN(q);
537 	int	*v;
538 
539 	if (data == NULL || (datalen != sizeof (int)))
540 		return (ENOPROTOOPT);
541 
542 	v = (int *)data;
543 	if (*v != 1)
544 		return (ENOPROTOOPT);
545 
546 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
547 	if (ipst->ips_ip_g_mrouter != NULL) {
548 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
549 		return (EADDRINUSE);
550 	}
551 
552 	ipst->ips_ip_g_mrouter = q;
553 	connp->conn_multi_router = 1;
554 	/* In order for tunnels to work we have to turn ip_g_forward on */
555 	if (!WE_ARE_FORWARDING(ipst)) {
556 		if (ipst->ips_ip_mrtdebug > 1) {
557 			(void) mi_strlog(q, 1, SL_TRACE,
558 			    "ip_mrouter_init: turning on forwarding");
559 		}
560 		ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
561 		ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
562 	}
563 
564 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
565 	return (0);
566 }
567 
568 void
569 ip_mrouter_stack_init(ip_stack_t *ipst)
570 {
571 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
572 
573 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
574 	    KM_SLEEP);
575 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
576 	/*
577 	 * mfctable:
578 	 * Includes all mfcs, including waiting upcalls.
579 	 * Multiple mfcs per bucket.
580 	 */
581 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
582 	    KM_SLEEP);
583 	/*
584 	 * Define the token bucket filter structures.
585 	 * tbftable -> each vif has one of these for storing info.
586 	 */
587 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
588 
589 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
590 
591 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
592 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
593 }
594 
595 /*
596  * Disable multicast routing.
597  * Didn't use global timeout_val (BSD version), instead check the mfctable.
598  */
599 int
600 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
601 {
602 	conn_t		*connp;
603 	vifi_t 		vifi;
604 	struct mfc	*mfc_rt;
605 	int		i;
606 
607 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
608 	if (ipst->ips_ip_g_mrouter == NULL) {
609 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
610 		return (EINVAL);
611 	}
612 
613 	connp = Q_TO_CONN(ipst->ips_ip_g_mrouter);
614 
615 	if (ipst->ips_saved_ip_g_forward != -1) {
616 		if (ipst->ips_ip_mrtdebug > 1) {
617 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
618 			    "ip_mrouter_done: turning off forwarding");
619 		}
620 		ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward;
621 		ipst->ips_saved_ip_g_forward = -1;
622 	}
623 
624 	/*
625 	 * Always clear cache when vifs change.
626 	 * No need to get ipst->ips_last_encap_lock since we are running as
627 	 * a writer.
628 	 */
629 	mutex_enter(&ipst->ips_last_encap_lock);
630 	ipst->ips_last_encap_src = 0;
631 	ipst->ips_last_encap_vif = NULL;
632 	mutex_exit(&ipst->ips_last_encap_lock);
633 	connp->conn_multi_router = 0;
634 
635 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
636 
637 	/*
638 	 * For each phyint in use,
639 	 * disable promiscuous reception of all IP multicasts.
640 	 */
641 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
642 		struct vif *vifp = ipst->ips_vifs + vifi;
643 
644 		mutex_enter(&vifp->v_lock);
645 		/*
646 		 * if the vif is active mark it condemned.
647 		 */
648 		if (vifp->v_marks & VIF_MARK_GOOD) {
649 			ASSERT(vifp->v_ipif != NULL);
650 			ipif_refhold(vifp->v_ipif);
651 			/* Phyint only */
652 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
653 				ipif_t *ipif = vifp->v_ipif;
654 				ipsq_t  *ipsq;
655 				boolean_t suc;
656 				ill_t *ill;
657 
658 				ill = ipif->ipif_ill;
659 				suc = B_FALSE;
660 				if (mp == NULL) {
661 					/*
662 					 * being called from ip_close,
663 					 * lets do it synchronously.
664 					 * Clear VIF_MARK_GOOD and
665 					 * set VIF_MARK_CONDEMNED.
666 					 */
667 					vifp->v_marks &= ~VIF_MARK_GOOD;
668 					vifp->v_marks |= VIF_MARK_CONDEMNED;
669 					mutex_exit(&(vifp)->v_lock);
670 					suc = ipsq_enter(ill, B_FALSE);
671 					ipsq = ill->ill_phyint->phyint_ipsq;
672 				} else {
673 					ipsq = ipsq_try_enter(ipif, NULL,
674 					    ipst->ips_ip_g_mrouter, mp,
675 					    ip_restart_optmgmt, NEW_OP, B_TRUE);
676 					if (ipsq == NULL) {
677 						mutex_exit(&(vifp)->v_lock);
678 						return (EINPROGRESS);
679 					}
680 					/*
681 					 * Clear VIF_MARK_GOOD and
682 					 * set VIF_MARK_CONDEMNED.
683 					 */
684 					vifp->v_marks &= ~VIF_MARK_GOOD;
685 					vifp->v_marks |= VIF_MARK_CONDEMNED;
686 						mutex_exit(&(vifp)->v_lock);
687 					suc = B_TRUE;
688 				}
689 
690 				if (suc) {
691 					(void) ip_delmulti(INADDR_ANY, ipif,
692 					    B_TRUE, B_TRUE);
693 					ipsq_exit(ipsq, B_TRUE, B_TRUE);
694 				}
695 				mutex_enter(&vifp->v_lock);
696 			}
697 			/*
698 			 * decreases the refcnt added in add_vif.
699 			 * and release v_lock.
700 			 */
701 			VIF_REFRELE_LOCKED(vifp);
702 		} else {
703 			mutex_exit(&vifp->v_lock);
704 			continue;
705 		}
706 	}
707 
708 	mutex_enter(&ipst->ips_numvifs_mutex);
709 	ipst->ips_numvifs = 0;
710 	ipst->ips_pim_assert = 0;
711 	ipst->ips_reg_vif_num = ALL_VIFS;
712 	mutex_exit(&ipst->ips_numvifs_mutex);
713 
714 	/*
715 	 * Free upcall msgs.
716 	 * Go through mfctable and stop any outstanding upcall
717 	 * timeouts remaining on mfcs.
718 	 */
719 	for (i = 0; i < MFCTBLSIZ; i++) {
720 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
721 		ipst->ips_mfcs[i].mfcb_refcnt++;
722 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
723 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
724 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
725 		while (mfc_rt) {
726 			/* Free upcalls */
727 			mutex_enter(&mfc_rt->mfc_mutex);
728 			if (mfc_rt->mfc_rte != NULL) {
729 				if (mfc_rt->mfc_timeout_id != 0) {
730 					/*
731 					 * OK to drop the lock as we have
732 					 * a refcnt on the bucket. timeout
733 					 * can fire but it will see that
734 					 * mfc_timeout_id == 0 and not do
735 					 * anything. see expire_upcalls().
736 					 */
737 					mfc_rt->mfc_timeout_id = 0;
738 					mutex_exit(&mfc_rt->mfc_mutex);
739 					(void) untimeout(
740 					    mfc_rt->mfc_timeout_id);
741 						mfc_rt->mfc_timeout_id = 0;
742 					mutex_enter(&mfc_rt->mfc_mutex);
743 
744 					/*
745 					 * all queued upcall packets
746 					 * and mblk will be freed in
747 					 * release_mfc().
748 					 */
749 				}
750 			}
751 
752 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
753 
754 			mutex_exit(&mfc_rt->mfc_mutex);
755 			mfc_rt = mfc_rt->mfc_next;
756 		}
757 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
758 	}
759 
760 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
761 	ipst->ips_ip_g_mrouter = NULL;
762 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
763 	return (0);
764 }
765 
766 void
767 ip_mrouter_stack_destroy(ip_stack_t *ipst)
768 {
769 	struct mfcb *mfcbp;
770 	struct mfc  *rt;
771 	int i;
772 
773 	for (i = 0; i < MFCTBLSIZ; i++) {
774 		mfcbp = &ipst->ips_mfcs[i];
775 
776 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
777 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
778 			    i);
779 
780 			mfcbp->mfcb_mfc = rt->mfc_next;
781 			free_queue(rt);
782 			mi_free(rt);
783 		}
784 	}
785 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
786 	ipst->ips_vifs = NULL;
787 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
788 	ipst->ips_mrtstat = NULL;
789 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
790 	ipst->ips_mfcs = NULL;
791 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
792 	ipst->ips_tbfs = NULL;
793 
794 	mutex_destroy(&ipst->ips_last_encap_lock);
795 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
796 }
797 
798 static boolean_t
799 is_mrouter_off(ip_stack_t *ipst)
800 {
801 	conn_t	*connp;
802 
803 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
804 	if (ipst->ips_ip_g_mrouter == NULL) {
805 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
806 		return (B_TRUE);
807 	}
808 
809 	connp = Q_TO_CONN(ipst->ips_ip_g_mrouter);
810 	if (connp->conn_multi_router == 0) {
811 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
812 		return (B_TRUE);
813 	}
814 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
815 	return (B_FALSE);
816 }
817 
818 static void
819 unlock_good_vif(struct vif *vifp)
820 {
821 	ASSERT(vifp->v_ipif != NULL);
822 	ipif_refrele(vifp->v_ipif);
823 	VIF_REFRELE(vifp);
824 }
825 
826 static boolean_t
827 lock_good_vif(struct vif *vifp)
828 {
829 	mutex_enter(&vifp->v_lock);
830 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
831 		mutex_exit(&vifp->v_lock);
832 		return (B_FALSE);
833 	}
834 
835 	ASSERT(vifp->v_ipif != NULL);
836 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
837 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
838 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
839 		mutex_exit(&vifp->v_lock);
840 		return (B_FALSE);
841 	}
842 	ipif_refhold_locked(vifp->v_ipif);
843 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
844 	vifp->v_refcnt++;
845 	mutex_exit(&vifp->v_lock);
846 	return (B_TRUE);
847 }
848 
849 /*
850  * Add a vif to the vif table.
851  */
852 static int
853 add_vif(struct vifctl *vifcp, queue_t *q, mblk_t *first_mp, ip_stack_t *ipst)
854 {
855 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
856 	ipif_t		*ipif;
857 	int		error;
858 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
859 	conn_t   	*connp = Q_TO_CONN(q);
860 	ipsq_t  	*ipsq;
861 
862 	ASSERT(connp != NULL);
863 
864 	if (vifcp->vifc_vifi >= MAXVIFS)
865 		return (EINVAL);
866 
867 	if (is_mrouter_off(ipst))
868 		return (EINVAL);
869 
870 	mutex_enter(&vifp->v_lock);
871 	/*
872 	 * Viftable entry should be 0.
873 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
874 	 * initialized.
875 	 *
876 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
877 	 * request while the delete is in progress, mrouted only sends add
878 	 * requests when a new interface is added and the new interface cannot
879 	 * have the same vifi as an existing interface. We make sure that
880 	 * ill_delete will block till the vif is deleted by adding a refcnt
881 	 * to ipif in del_vif().
882 	 */
883 	if (vifp->v_lcl_addr.s_addr != 0 ||
884 	    vifp->v_marks != 0 ||
885 	    vifp->v_refcnt != 0) {
886 		mutex_exit(&vifp->v_lock);
887 		return (EADDRINUSE);
888 	}
889 
890 	/* Incoming vif should not be 0 */
891 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
892 		mutex_exit(&vifp->v_lock);
893 		return (EINVAL);
894 	}
895 
896 	vifp->v_refcnt++;
897 	mutex_exit(&vifp->v_lock);
898 	/* Find the interface with the local address */
899 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
900 	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
901 	    ip_restart_optmgmt, &error, ipst);
902 	if (ipif == NULL) {
903 		VIF_REFRELE(vifp);
904 		if (error == EINPROGRESS)
905 			return (error);
906 		return (EADDRNOTAVAIL);
907 	}
908 
909 	/*
910 	 * We have to be exclusive as we have to call ip_addmulti()
911 	 * This is the best position to try to be exclusive in case
912 	 * we have to wait.
913 	 */
914 	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
915 	    ip_restart_optmgmt, NEW_OP, B_TRUE);
916 	if ((ipsq) == NULL) {
917 		VIF_REFRELE(vifp);
918 		ipif_refrele(ipif);
919 		return (EINPROGRESS);
920 	}
921 
922 	if (ipst->ips_ip_mrtdebug > 1) {
923 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
924 		    "add_vif: src 0x%x enter",
925 		    vifcp->vifc_lcl_addr.s_addr);
926 	}
927 
928 	mutex_enter(&vifp->v_lock);
929 	/*
930 	 * Always clear cache when vifs change.
931 	 * Needed to ensure that src isn't left over from before vif was added.
932 	 * No need to get last_encap_lock, since we are running as a writer.
933 	 */
934 
935 	mutex_enter(&ipst->ips_last_encap_lock);
936 	ipst->ips_last_encap_src = 0;
937 	ipst->ips_last_encap_vif = NULL;
938 	mutex_exit(&ipst->ips_last_encap_lock);
939 
940 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
941 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
942 			cmn_err(CE_WARN,
943 			    "add_vif: source route tunnels not supported\n");
944 			VIF_REFRELE_LOCKED(vifp);
945 			ipif_refrele(ipif);
946 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
947 			return (EOPNOTSUPP);
948 		}
949 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
950 
951 	} else {
952 		/* Phyint or Register vif */
953 		if (vifcp->vifc_flags & VIFF_REGISTER) {
954 			/*
955 			 * Note: Since all IPPROTO_IP level options (including
956 			 * MRT_ADD_VIF) are done exclusively via
957 			 * ip_optmgmt_writer(), a lock is not necessary to
958 			 * protect reg_vif_num.
959 			 */
960 			mutex_enter(&ipst->ips_numvifs_mutex);
961 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
962 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
963 				mutex_exit(&ipst->ips_numvifs_mutex);
964 			} else {
965 				mutex_exit(&ipst->ips_numvifs_mutex);
966 				VIF_REFRELE_LOCKED(vifp);
967 				ipif_refrele(ipif);
968 				ipsq_exit(ipsq, B_TRUE, B_TRUE);
969 				return (EADDRINUSE);
970 			}
971 		}
972 
973 		/* Make sure the interface supports multicast */
974 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
975 			VIF_REFRELE_LOCKED(vifp);
976 			ipif_refrele(ipif);
977 			if (vifcp->vifc_flags & VIFF_REGISTER) {
978 				mutex_enter(&ipst->ips_numvifs_mutex);
979 				ipst->ips_reg_vif_num = ALL_VIFS;
980 				mutex_exit(&ipst->ips_numvifs_mutex);
981 			}
982 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
983 			return (EOPNOTSUPP);
984 		}
985 		/* Enable promiscuous reception of all IP mcasts from the if */
986 		mutex_exit(&vifp->v_lock);
987 		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
988 		    MODE_IS_EXCLUDE, NULL);
989 		mutex_enter(&vifp->v_lock);
990 		/*
991 		 * since we released the lock lets make sure that
992 		 * ip_mrouter_done() has not been called.
993 		 */
994 		if (error != 0 || is_mrouter_off(ipst)) {
995 			if (error == 0)
996 				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
997 				    B_TRUE);
998 			if (vifcp->vifc_flags & VIFF_REGISTER) {
999 				mutex_enter(&ipst->ips_numvifs_mutex);
1000 				ipst->ips_reg_vif_num = ALL_VIFS;
1001 				mutex_exit(&ipst->ips_numvifs_mutex);
1002 			}
1003 			VIF_REFRELE_LOCKED(vifp);
1004 			ipif_refrele(ipif);
1005 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1006 			return (error?error:EINVAL);
1007 		}
1008 	}
1009 	/* Define parameters for the tbf structure */
1010 	vifp->v_tbf = v_tbf;
1011 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
1012 	vifp->v_tbf->tbf_n_tok = 0;
1013 	vifp->v_tbf->tbf_q_len = 0;
1014 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1015 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1016 
1017 	vifp->v_flags = vifcp->vifc_flags;
1018 	vifp->v_threshold = vifcp->vifc_threshold;
1019 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1020 	vifp->v_ipif = ipif;
1021 	ipif_refrele(ipif);
1022 	/* Scaling up here, allows division by 1024 in critical code.	*/
1023 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1024 	vifp->v_timeout_id = 0;
1025 	/* initialize per vif pkt counters */
1026 	vifp->v_pkt_in = 0;
1027 	vifp->v_pkt_out = 0;
1028 	vifp->v_bytes_in = 0;
1029 	vifp->v_bytes_out = 0;
1030 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1031 
1032 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1033 	mutex_enter(&ipst->ips_numvifs_mutex);
1034 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1035 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1036 	mutex_exit(&ipst->ips_numvifs_mutex);
1037 
1038 	if (ipst->ips_ip_mrtdebug > 1) {
1039 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
1040 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1041 		    vifcp->vifc_vifi,
1042 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1043 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1044 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1045 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1046 	}
1047 
1048 	vifp->v_marks = VIF_MARK_GOOD;
1049 	mutex_exit(&vifp->v_lock);
1050 	ipsq_exit(ipsq, B_TRUE, B_TRUE);
1051 	return (0);
1052 }
1053 
1054 
1055 /* Delete a vif from the vif table. */
1056 static void
1057 del_vifp(struct vif *vifp)
1058 {
1059 	struct tbf	*t = vifp->v_tbf;
1060 	mblk_t  *mp0;
1061 	vifi_t  vifi;
1062 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1063 
1064 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1065 	ASSERT(t != NULL);
1066 
1067 	/*
1068 	 * release the ref we put in vif_del.
1069 	 */
1070 	ASSERT(vifp->v_ipif != NULL);
1071 	ipif_refrele(vifp->v_ipif);
1072 
1073 	if (ipst->ips_ip_mrtdebug > 1) {
1074 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
1075 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1076 	}
1077 
1078 	if (vifp->v_timeout_id != 0) {
1079 		(void) untimeout(vifp->v_timeout_id);
1080 		vifp->v_timeout_id = 0;
1081 	}
1082 
1083 	/*
1084 	 * Free packets queued at the interface.
1085 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1086 	 */
1087 	mutex_enter(&t->tbf_lock);
1088 	while (t->tbf_q != NULL) {
1089 		mp0 = t->tbf_q;
1090 		t->tbf_q = t->tbf_q->b_next;
1091 		mp0->b_prev = mp0->b_next = NULL;
1092 		freemsg(mp0);
1093 	}
1094 	mutex_exit(&t->tbf_lock);
1095 
1096 	/*
1097 	 * Always clear cache when vifs change.
1098 	 * No need to get last_encap_lock since we are running as a writer.
1099 	 */
1100 	mutex_enter(&ipst->ips_last_encap_lock);
1101 	if (vifp == ipst->ips_last_encap_vif) {
1102 		ipst->ips_last_encap_vif = NULL;
1103 		ipst->ips_last_encap_src = 0;
1104 	}
1105 	mutex_exit(&ipst->ips_last_encap_lock);
1106 
1107 	mutex_destroy(&t->tbf_lock);
1108 
1109 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1110 
1111 	/* Adjust numvifs down */
1112 	mutex_enter(&ipst->ips_numvifs_mutex);
1113 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1114 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1115 			break;
1116 	ipst->ips_numvifs = vifi;
1117 	mutex_exit(&ipst->ips_numvifs_mutex);
1118 
1119 	bzero(vifp, sizeof (*vifp));
1120 }
1121 
1122 static int
1123 del_vif(vifi_t *vifip, queue_t *q, mblk_t *first_mp, ip_stack_t *ipst)
1124 {
1125 	struct vif	*vifp = ipst->ips_vifs + *vifip;
1126 	conn_t		*connp;
1127 	ipsq_t  	*ipsq;
1128 
1129 	if (*vifip >= ipst->ips_numvifs)
1130 		return (EINVAL);
1131 
1132 
1133 	mutex_enter(&vifp->v_lock);
1134 	/*
1135 	 * Not initialized
1136 	 * Here we are not looking at the vif that is being initialized
1137 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1138 	 */
1139 	if (vifp->v_lcl_addr.s_addr == 0 ||
1140 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1141 		mutex_exit(&vifp->v_lock);
1142 		return (EADDRNOTAVAIL);
1143 	}
1144 
1145 	/*
1146 	 * This is an optimization, if first_mp == NULL
1147 	 * than we are being called from reset_mrt_vif_ipif()
1148 	 * so we already have exclusive access to the ipsq.
1149 	 * the ASSERT below is a check for this condition.
1150 	 */
1151 	if (first_mp != NULL &&
1152 	    !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1153 		connp = Q_TO_CONN(q);
1154 		ASSERT(connp != NULL);
1155 		/*
1156 		 * We have to be exclusive as we have to call ip_delmulti()
1157 		 * This is the best position to try to be exclusive in case
1158 		 * we have to wait.
1159 		 */
1160 		ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
1161 		    first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
1162 		if ((ipsq) == NULL) {
1163 			mutex_exit(&vifp->v_lock);
1164 			return (EINPROGRESS);
1165 		}
1166 		/* recheck after being exclusive */
1167 		if (vifp->v_lcl_addr.s_addr == 0 ||
1168 		    !vifp->v_marks & VIF_MARK_GOOD) {
1169 			/*
1170 			 * someone beat us.
1171 			 */
1172 			mutex_exit(&vifp->v_lock);
1173 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1174 			return (EADDRNOTAVAIL);
1175 		}
1176 	}
1177 
1178 
1179 	ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));
1180 
1181 
1182 	/*
1183 	 * add a refhold so that ipif does not go away while
1184 	 * there are still users, this will be released in del_vifp
1185 	 * when we free the vif.
1186 	 */
1187 	ipif_refhold(vifp->v_ipif);
1188 
1189 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1190 	vifp->v_marks &= ~VIF_MARK_GOOD;
1191 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1192 
1193 	/* Phyint only */
1194 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1195 		ipif_t *ipif = vifp->v_ipif;
1196 		ASSERT(ipif != NULL);
1197 		/*
1198 		 * should be OK to drop the lock as we
1199 		 * have marked this as CONDEMNED.
1200 		 */
1201 		mutex_exit(&(vifp)->v_lock);
1202 		(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
1203 		if (first_mp != NULL)
1204 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1205 		mutex_enter(&(vifp)->v_lock);
1206 	}
1207 
1208 	/*
1209 	 * decreases the refcnt added in add_vif.
1210 	 */
1211 	VIF_REFRELE_LOCKED(vifp);
1212 	return (0);
1213 }
1214 
/*
 * Add an mfc entry.
 *
 * Installs or updates the multicast forwarding cache entry for
 * (mfcc_origin, mfcc_mcastgrp) described by mfccp.  Three cases are
 * handled, in this order:
 *   1. MFCFIND() locates an entry: only its parent and ttls are updated.
 *   2. One or more entries with queued upcall packets (mfc_rte != NULL)
 *      match: each is filled in, its expiry timeout is cancelled and
 *      the queued packets are passed to ip_mdq().
 *   3. Nothing matched (nstl == 0): under the bucket lock an existing
 *      non-condemned entry is refilled, or a new one is allocated and
 *      linked at the head of the hash chain.
 *
 * Returns 0 on success, EINVAL if mfcc_parent is out of range, the
 * parent vif has no ipif, or the mrouter is off, and ENOBUFS if a new
 * entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;
	int i;
	struct mfcb *mfcbp;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	/* A real parent vif must have an ipif attached. */
	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the hash bucket for the whole operation. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		/* Copy one ttl per currently configured vif. */
		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 * nstl counts the matching entries; more than one is reported
	 * with a warning.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(ipst->ips_ip_g_mrouter, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 * Each rte is unlinked under mfc_mutex, then the
			 * mutex is dropped around the ip_mdq() call.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check now that the bucket lock is held. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Reuse a matching, non-condemned entry if there is one. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
				mfccp->mfcc_mcastgrp.s_addr) &&
				(!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next   = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}
1412 
1413 /*
1414  * Fills in mfc structure from mrouted mfcctl.
1415  */
1416 static void
1417 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1418 {
1419 	int i;
1420 
1421 	rt->mfc_origin		= mfccp->mfcc_origin;
1422 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1423 	rt->mfc_parent		= mfccp->mfcc_parent;
1424 	mutex_enter(&ipst->ips_numvifs_mutex);
1425 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1426 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1427 	}
1428 	mutex_exit(&ipst->ips_numvifs_mutex);
1429 	/* Initialize pkt counters per src-grp */
1430 	rt->mfc_pkt_cnt	= 0;
1431 	rt->mfc_byte_cnt	= 0;
1432 	rt->mfc_wrong_if	= 0;
1433 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1434 
1435 }
1436 
1437 static void
1438 free_queue(struct mfc *mfcp)
1439 {
1440 	struct rtdetq *rte0;
1441 
1442 	/*
1443 	 * Drop all queued upcall packets.
1444 	 * Free the mbuf with the pkt.
1445 	 */
1446 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1447 		mfcp->mfc_rte = rte0->rte_next;
1448 		freemsg(rte0->mp);
1449 		mi_free((char *)rte0);
1450 	}
1451 }
1452 /*
1453  * go thorugh the hash bucket and free all the entries marked condemned.
1454  */
1455 void
1456 release_mfc(struct mfcb *mfcbp)
1457 {
1458 	struct mfc *current_mfcp;
1459 	struct mfc *prev_mfcp;
1460 
1461 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1462 
1463 	while (current_mfcp != NULL) {
1464 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1465 			if (current_mfcp == mfcbp->mfcb_mfc) {
1466 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1467 				free_queue(current_mfcp);
1468 				mi_free(current_mfcp);
1469 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1470 				continue;
1471 			}
1472 			ASSERT(prev_mfcp != NULL);
1473 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1474 			free_queue(current_mfcp);
1475 			mi_free(current_mfcp);
1476 			current_mfcp = NULL;
1477 		} else {
1478 			prev_mfcp = current_mfcp;
1479 		}
1480 
1481 		current_mfcp = prev_mfcp->mfc_next;
1482 
1483 	}
1484 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1485 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1486 }
1487 
1488 /*
1489  * Delete an mfc entry.
1490  */
1491 static int
1492 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1493 {
1494 	struct in_addr	origin;
1495 	struct in_addr	mcastgrp;
1496 	struct mfc 		*rt;
1497 	uint_t			hash;
1498 
1499 	origin = mfccp->mfcc_origin;
1500 	mcastgrp = mfccp->mfcc_mcastgrp;
1501 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1502 
1503 	if (ipst->ips_ip_mrtdebug > 1) {
1504 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
1505 		    "del_mfc: o %x g %x",
1506 		    ntohl(origin.s_addr),
1507 		    ntohl(mcastgrp.s_addr));
1508 	}
1509 
1510 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1511 
1512 	/* Find mfc in mfctable, finds only entries without upcalls */
1513 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1514 		mutex_enter(&rt->mfc_mutex);
1515 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1516 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1517 		    rt->mfc_rte == NULL &&
1518 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1519 			break;
1520 		mutex_exit(&rt->mfc_mutex);
1521 	}
1522 
1523 	/*
1524 	 * Return if there was an upcall (mfc_rte != NULL,
1525 	 * or rt not in mfctable.
1526 	 */
1527 	if (rt == NULL) {
1528 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1529 		return (EADDRNOTAVAIL);
1530 	}
1531 
1532 
1533 	/*
1534 	 * no need to hold lock as we have a reference.
1535 	 */
1536 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1537 	/* error checking */
1538 	if (rt->mfc_timeout_id != 0) {
1539 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1540 		/*
1541 		 * Its ok to drop the lock,  the struct cannot be freed
1542 		 * since we have a ref on the hash bucket.
1543 		 */
1544 		rt->mfc_timeout_id = 0;
1545 		mutex_exit(&rt->mfc_mutex);
1546 		(void) untimeout(rt->mfc_timeout_id);
1547 		mutex_enter(&rt->mfc_mutex);
1548 	}
1549 
1550 	ASSERT(rt->mfc_rte == NULL);
1551 
1552 
1553 	/*
1554 	 * Delete the entry from the cache
1555 	 */
1556 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1557 	mutex_exit(&rt->mfc_mutex);
1558 
1559 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1560 
1561 	return (0);
1562 }
1563 
#define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 *
 * If a forwarding cache entry exists the packet is handed to ip_mdq().
 * Otherwise a copy of the packet is queued on an mfc entry (created here
 * if needed) and, for the first such packet, an IGMPMSG_NOCACHE upcall
 * is sent to the routing daemon and an expiry timeout is started.
 *
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *                   1 - pkt came in on tunnel
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc 	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/*
	 * mp->b_prev carries either the PIM register marker or the
	 * tunnel source address (0 if the packet did not use a tunnel).
	 */
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
			(ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(ipst->ips_ip_g_mrouter, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(ipst->ips_ip_g_mrouter, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    "  REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/*
		 * Packet arrived via a physical interface: the header is
		 * too short to hold the tunnel option, or the option byte
		 * checked is not IPOPT_LSRR.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	/* The bucket hold is released on every return path below. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* PIM register packets are forwarded via the register vif's ill. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 *
		 * mp_copy is the copy sent up to the daemon (first upcall
		 * only); mp0 is the copy queued on the mfc entry's rtdetq.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(ipst->ips_ip_g_mrouter, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			/* From here on mfc_mutex is held in both cases. */
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(ipst->ips_ip_g_mrouter, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: walk to the tail and append */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next);
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The upcall header is written over the start of the copy. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 * Report the first vif whose ipif sits on
				 * the arrival ill.
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Both locks are dropped before the message goes up. */
			putnext(RD(ipst->ips_ip_g_mrouter), mp_copy);

		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/*
		 * Common failure exit.  Callers have already dropped
		 * mfc_mutex if they held it; release the bucket lock and
		 * hold, then free whatever was allocated on this pass.
		 * mfc_rt is freed only if it was allocated here (new_mfc).
		 */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1951 
1952 /*
1953  * Clean up the mfctable cache entry if upcall is not serviced.
1954  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1955  */
1956 static void
1957 expire_upcalls(void *arg)
1958 {
1959 	struct mfc *mfc_rt = arg;
1960 	uint_t hash;
1961 	struct mfc *prev_mfc, *mfc0;
1962 	ip_stack_t	*ipst;
1963 
1964 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1965 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1966 		return;
1967 	}
1968 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1969 
1970 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1971 	if (ipst->ips_ip_mrtdebug > 1) {
1972 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
1973 		    "expire_upcalls: hash %d s %x g %x",
1974 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1975 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1976 	}
1977 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1978 	mutex_enter(&mfc_rt->mfc_mutex);
1979 	/*
1980 	 * if timeout has been set to zero, than the
1981 	 * entry has been filled, no need to delete it.
1982 	 */
1983 	if (mfc_rt->mfc_timeout_id == 0)
1984 		goto done;
1985 	ipst->ips_mrtstat->mrts_cache_cleanups++;
1986 	mfc_rt->mfc_timeout_id = 0;
1987 
1988 	/* Determine entry to be cleaned up in cache table. */
1989 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1990 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1991 		if (mfc0 == mfc_rt)
1992 			break;
1993 
1994 	/* del_mfc takes care of gone mfcs */
1995 	ASSERT(prev_mfc != NULL);
1996 	ASSERT(mfc0 != NULL);
1997 
1998 	/*
1999 	 * Delete the entry from the cache
2000 	 */
2001 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
2002 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
2003 
2004 	/*
2005 	 * release_mfc will drop all queued upcall packets.
2006 	 * and will free the mbuf with the pkt, if, timing info.
2007 	 */
2008 done:
2009 	mutex_exit(&mfc_rt->mfc_mutex);
2010 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
2011 }
2012 
2013 /*
2014  * Packet forwarding routine once entry in the cache is made.
2015  */
2016 static int
2017 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
2018     struct mfc *rt)
2019 {
2020 	vifi_t vifi;
2021 	struct vif *vifp;
2022 	ipaddr_t dst = ipha->ipha_dst;
2023 	size_t  plen = msgdsize(mp);
2024 	vifi_t num_of_vifs;
2025 	ip_stack_t	*ipst = ill->ill_ipst;
2026 
2027 	if (ipst->ips_ip_mrtdebug > 1) {
2028 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2029 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
2030 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
2031 		    ill->ill_name);
2032 	}
2033 
2034 	/* Macro to send packet on vif */
2035 #define	MC_SEND(ipha, mp, vifp, dst) { \
2036 	if ((vifp)->v_flags & VIFF_TUNNEL) \
2037 		encap_send((ipha), (mp), (vifp), (dst)); \
2038 	else if ((vifp)->v_flags & VIFF_REGISTER) \
2039 		register_send((ipha), (mp), (vifp), (dst)); \
2040 	else \
2041 		phyint_send((ipha), (mp), (vifp), (dst)); \
2042 }
2043 
2044 	vifi = rt->mfc_parent;
2045 
2046 	/*
2047 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2048 	 * Mrouted had no route.
2049 	 * We wanted the route installed in the mfctable to prevent multiple
2050 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2051 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
2052 	 * 3.6.
2053 	 */
2054 	if (vifi == NO_VIF) {
2055 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2056 		    ill->ill_name));
2057 		if (ipst->ips_ip_mrtdebug > 1) {
2058 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2059 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2060 		}
2061 		return (-1);	/* drop pkt */
2062 	}
2063 
2064 	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
2065 		return (-1);
2066 	/*
2067 	 * The MFC entries are not cleaned up when an ipif goes
2068 	 * away thus this code has to guard against an MFC referencing
2069 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
2070 	 * sets the v_ipif to NULL when the ipif disappears.
2071 	 */
2072 	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
2073 
2074 	if (vifi >= ipst->ips_numvifs) {
2075 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2076 		    "%d ill %s viftable ill %s\n",
2077 		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2078 		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2079 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2080 		return (-1);
2081 	}
2082 	/*
2083 	 * Don't forward if it didn't arrive from the parent vif for its
2084 	 * origin. But do match on the groups as we nominate only one
2085 	 * ill in the group for receiving allmulti packets.
2086 	 */
2087 	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
2088 	    (ill->ill_group == NULL ||
2089 	    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
2090 		ill->ill_group)) ||
2091 	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2092 		/* Came in the wrong interface */
2093 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2094 			"numvifs %d ill %s viftable ill %s\n",
2095 			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2096 			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
2097 		if (ipst->ips_ip_mrtdebug > 1) {
2098 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2099 			    "ip_mdq: arrived wrong if, vifi %d ill "
2100 			    "%s viftable ill %s\n",
2101 			    (int)vifi, ill->ill_name,
2102 			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2103 		}
2104 		ipst->ips_mrtstat->mrts_wrong_if++;
2105 		rt->mfc_wrong_if++;
2106 
2107 		/*
2108 		 * If we are doing PIM assert processing and we are forwarding
2109 		 * packets on this interface, and it is a broadcast medium
2110 		 * interface (and not a tunnel), send a message to the routing.
2111 		 *
2112 		 * We use the first ipif on the list, since it's all we have.
2113 		 * Chances are the ipif_flags are the same for ipifs on the ill.
2114 		 */
2115 		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
2116 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2117 		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
2118 			mblk_t		*mp_copy;
2119 			struct igmpmsg	*im;
2120 
2121 			/* TODO could copy header and dup rest */
2122 			mp_copy = copymsg(mp);
2123 			if (mp_copy == NULL) {
2124 				ipst->ips_mrtstat->mrts_fwd_drop++;
2125 				ip1dbg(("ip_mdq: out of memory "
2126 				    "for mblk, mp_copy\n"));
2127 				unlock_good_vif(&ipst->ips_vifs[vifi]);
2128 				return (-1);
2129 			}
2130 
2131 			im = (struct igmpmsg *)mp_copy->b_rptr;
2132 			im->im_msgtype = IGMPMSG_WRONGVIF;
2133 			im->im_mbz = 0;
2134 			im->im_vif = (ushort_t)vifi;
2135 			putnext(RD(ipst->ips_ip_g_mrouter), mp_copy);
2136 		}
2137 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2138 		if (tunnel_src != 0)
2139 			return (1);
2140 		else
2141 			return (0);
2142 	}
2143 	/*
2144 	 * If I sourced this packet, it counts as output, else it was input.
2145 	 */
2146 	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
2147 		ipst->ips_vifs[vifi].v_pkt_out++;
2148 		ipst->ips_vifs[vifi].v_bytes_out += plen;
2149 	} else {
2150 		ipst->ips_vifs[vifi].v_pkt_in++;
2151 		ipst->ips_vifs[vifi].v_bytes_in += plen;
2152 	}
2153 	mutex_enter(&rt->mfc_mutex);
2154 	rt->mfc_pkt_cnt++;
2155 	rt->mfc_byte_cnt += plen;
2156 	mutex_exit(&rt->mfc_mutex);
2157 	unlock_good_vif(&ipst->ips_vifs[vifi]);
2158 	/*
2159 	 * For each vif, decide if a copy of the packet should be forwarded.
2160 	 * Forward if:
2161 	 *		- the vif threshold ttl is non-zero AND
2162 	 *		- the pkt ttl exceeds the vif's threshold
2163 	 * A non-zero mfc_ttl indicates that the vif is part of
2164 	 * the output set for the mfc entry.
2165 	 */
2166 	mutex_enter(&ipst->ips_numvifs_mutex);
2167 	num_of_vifs = ipst->ips_numvifs;
2168 	mutex_exit(&ipst->ips_numvifs_mutex);
2169 	for (vifp = ipst->ips_vifs, vifi = 0;
2170 	    vifi < num_of_vifs;
2171 	    vifp++, vifi++) {
2172 		if (!lock_good_vif(vifp))
2173 			continue;
2174 		if ((rt->mfc_ttls[vifi] > 0) &&
2175 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2176 			/*
2177 			 * lock_good_vif should not have succedded if
2178 			 * v_ipif is null.
2179 			 */
2180 			ASSERT(vifp->v_ipif != NULL);
2181 			vifp->v_pkt_out++;
2182 			vifp->v_bytes_out += plen;
2183 			MC_SEND(ipha, mp, vifp, dst);
2184 			ipst->ips_mrtstat->mrts_fwd_out++;
2185 		}
2186 		unlock_good_vif(vifp);
2187 	}
2188 	if (tunnel_src != 0)
2189 		return (1);
2190 	else
2191 		return (0);
2192 }
2193 
2194 /*
2195  * Send the packet on physical interface.
2196  * Caller assumes can continue to use mp on return.
2197  */
2198 /* ARGSUSED */
2199 static void
2200 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2201 {
2202 	mblk_t 	*mp_copy;
2203 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2204 
2205 	/* Make a new reference to the packet */
2206 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2207 	if (mp_copy == NULL) {
2208 		ipst->ips_mrtstat->mrts_fwd_drop++;
2209 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2210 		return;
2211 	}
2212 	if (vifp->v_rate_limit <= 0)
2213 		tbf_send_packet(vifp, mp_copy);
2214 	else  {
2215 		if (ipst->ips_ip_mrtdebug > 1) {
2216 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2217 			    "phyint_send: tbf_contr rate %d "
2218 			    "vifp 0x%p mp 0x%p dst 0x%x",
2219 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2220 		}
2221 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2222 	}
2223 }
2224 
2225 /*
2226  * Send the whole packet for REGISTER encapsulation to PIM daemon
2227  * Caller assumes it can continue to use mp on return.
2228  */
2229 /* ARGSUSED */
2230 static void
2231 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2232 {
2233 	struct igmpmsg	*im;
2234 	mblk_t		*mp_copy;
2235 	ipha_t		*ipha_copy;
2236 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2237 
2238 	if (ipst->ips_ip_mrtdebug > 1) {
2239 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2240 		    "register_send: src %x, dst %x\n",
2241 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2242 	}
2243 
2244 	/*
2245 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2246 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2247 	 * ethernet driver will.
2248 	 */
2249 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2250 	if (mp_copy == NULL) {
2251 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2252 		if (ipst->ips_ip_mrtdebug > 3) {
2253 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2254 			    "register_send: allocb failure.");
2255 		}
2256 		return;
2257 	}
2258 
2259 	/*
2260 	 * Bump write pointer to account for igmpmsg being added.
2261 	 */
2262 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2263 
2264 	/*
2265 	 * Chain packet to new mblk_t.
2266 	 */
2267 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2268 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2269 		if (ipst->ips_ip_mrtdebug > 3) {
2270 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2271 			    "register_send: copymsg failure.");
2272 		}
2273 		freeb(mp_copy);
2274 		return;
2275 	}
2276 
2277 	/*
2278 	 * icmp_rput() asserts that IP version field is set to an
2279 	 * appropriate version. Hence, the struct igmpmsg that this really
2280 	 * becomes, needs to have the correct IP version field.
2281 	 */
2282 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2283 	*ipha_copy = multicast_encap_iphdr;
2284 
2285 	/*
2286 	 * The kernel uses the struct igmpmsg header to encode the messages to
2287 	 * the multicast routing daemon. Fill in the fields in the header
2288 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2289 	 */
2290 	im = (struct igmpmsg *)mp_copy->b_rptr;
2291 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2292 	im->im_src.s_addr = ipha->ipha_src;
2293 	im->im_dst.s_addr = ipha->ipha_dst;
2294 
2295 	/*
2296 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2297 	 * header with renamed fields and the multicast routing daemon uses
2298 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2299 	 */
2300 	im->im_mbz = 0;
2301 
2302 	++ipst->ips_mrtstat->mrts_upcalls;
2303 	if (!canputnext(RD(ipst->ips_ip_g_mrouter))) {
2304 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2305 		if (ipst->ips_ip_mrtdebug > 3) {
2306 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2307 			    "register_send: register upcall failure.");
2308 		}
2309 		freemsg(mp_copy);
2310 	} else {
2311 		putnext(RD(ipst->ips_ip_g_mrouter), mp_copy);
2312 	}
2313 }
2314 
2315 /*
2316  * pim_validate_cksum handles verification of the checksum in the
2317  * pim header.  For PIM Register packets, the checksum is calculated
2318  * across the PIM header only.  For all other packets, the checksum
2319  * is for the PIM header and remainder of the packet.
2320  *
2321  * returns: B_TRUE, if checksum is okay.
2322  *          B_FALSE, if checksum is not valid.
2323  */
2324 static boolean_t
2325 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2326 {
2327 	mblk_t *mp_dup;
2328 
2329 	if ((mp_dup = dupmsg(mp)) == NULL)
2330 		return (B_FALSE);
2331 
2332 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2333 	if (pimp->pim_type == PIM_REGISTER)
2334 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2335 	if (IP_CSUM(mp_dup, 0, 0)) {
2336 		freemsg(mp_dup);
2337 		return (B_FALSE);
2338 	}
2339 	freemsg(mp_dup);
2340 	return (B_TRUE);
2341 }
2342 
2343 /*
2344  * int
2345  * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
2346  *	IP Protocol 103. Register messages are decapsulated and sent
2347  *	onto multicast forwarding.
2348  */
2349 int
2350 pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
2351 {
2352 	ipha_t		*eip, *ip;
2353 	int		iplen, pimlen, iphlen;
2354 	struct pim	*pimp;	/* pointer to a pim struct */
2355 	uint32_t	*reghdr;
2356 	ip_stack_t	*ipst = ill->ill_ipst;
2357 
2358 	/*
2359 	 * Pullup the msg for PIM protocol processing.
2360 	 */
2361 	if (pullupmsg(mp, -1) == 0) {
2362 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2363 		freemsg(mp);
2364 		return (-1);
2365 	}
2366 
2367 	ip = (ipha_t *)mp->b_rptr;
2368 	iplen = ip->ipha_length;
2369 	iphlen = IPH_HDR_LENGTH(ip);
2370 	pimlen = ntohs(iplen) - iphlen;
2371 
2372 	/*
2373 	 * Validate lengths
2374 	 */
2375 	if (pimlen < PIM_MINLEN) {
2376 		++ipst->ips_mrtstat->mrts_pim_malformed;
2377 		if (ipst->ips_ip_mrtdebug > 1) {
2378 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2379 			    "pim_input: length not at least minlen");
2380 		}
2381 		freemsg(mp);
2382 		return (-1);
2383 	}
2384 
2385 	/*
2386 	 * Point to the PIM header.
2387 	 */
2388 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2389 
2390 	/*
2391 	 * Check the version number.
2392 	 */
2393 	if (pimp->pim_vers != PIM_VERSION) {
2394 		++ipst->ips_mrtstat->mrts_pim_badversion;
2395 		if (ipst->ips_ip_mrtdebug > 1) {
2396 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2397 			    "pim_input: unknown version of PIM");
2398 		}
2399 		freemsg(mp);
2400 		return (-1);
2401 	}
2402 
2403 	/*
2404 	 * Validate the checksum
2405 	 */
2406 	if (!pim_validate_cksum(mp, ip, pimp)) {
2407 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2408 		if (ipst->ips_ip_mrtdebug > 1) {
2409 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2410 			    "pim_input: invalid checksum");
2411 		}
2412 		freemsg(mp);
2413 		return (-1);
2414 	}
2415 
2416 	if (pimp->pim_type != PIM_REGISTER)
2417 		return (0);
2418 
2419 	reghdr = (uint32_t *)(pimp + 1);
2420 	eip = (ipha_t *)(reghdr + 1);
2421 
2422 	/*
2423 	 * check if the inner packet is destined to mcast group
2424 	 */
2425 	if (!CLASSD(eip->ipha_dst)) {
2426 		++ipst->ips_mrtstat->mrts_pim_badregisters;
2427 		if (ipst->ips_ip_mrtdebug > 1) {
2428 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2429 			    "pim_input: Inner pkt not mcast .. !");
2430 		}
2431 		freemsg(mp);
2432 		return (-1);
2433 	}
2434 	if (ipst->ips_ip_mrtdebug > 1) {
2435 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2436 		    "register from %x, to %x, len %d",
2437 		    ntohl(eip->ipha_src),
2438 		    ntohl(eip->ipha_dst),
2439 		    ntohs(eip->ipha_length));
2440 	}
2441 	/*
2442 	 * If the null register bit is not set, decapsulate
2443 	 * the packet before forwarding it.
2444 	 */
2445 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
2446 		mblk_t *mp_copy;
2447 
2448 		/* Copy the message */
2449 		if ((mp_copy = copymsg(mp)) == NULL) {
2450 			++ipst->ips_mrtstat->mrts_pim_nomemory;
2451 			freemsg(mp);
2452 			return (-1);
2453 		}
2454 
2455 		/*
2456 		 * Decapsulate the packet and give it to
2457 		 * register_mforward.
2458 		 */
2459 		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
2460 		    sizeof (*reghdr);
2461 		if (register_mforward(q, mp_copy, ill) != 0) {
2462 			freemsg(mp);
2463 			return (-1);
2464 		}
2465 	}
2466 
2467 	/*
2468 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2469 	 * PIM socket. For Solaris it is done right after pim_input() is
2470 	 * called.
2471 	 */
2472 	return (0);
2473 }
2474 
2475 /*
2476  * PIM sparse mode hook.  Called by pim_input after decapsulating
2477  * the packet. Loop back the packet, as if we have received it.
2478  * In pim_input() we have to check if the destination is a multicast address.
2479  */
2480 /* ARGSUSED */
2481 static int
2482 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
2483 {
2484 	ip_stack_t	*ipst = ill->ill_ipst;
2485 
2486 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2487 
2488 	if (ipst->ips_ip_mrtdebug > 3) {
2489 		ipha_t *ipha;
2490 
2491 		ipha = (ipha_t *)mp->b_rptr;
2492 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2493 		    "register_mforward: src %x, dst %x\n",
2494 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2495 	}
2496 	/*
2497 	 * Need to pass in to ip_mforward() the information that the
2498 	 * packet has arrived on the register_vif. We use the solution that
2499 	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
2500 	 * to ip_mforward(). Nonzero value means the packet has arrived on a
2501 	 * tunnel (ip_mroute_decap() puts the address of the other side of the
2502 	 * tunnel there.) This is safe since ip_rput() either frees the packet
2503 	 * or passes it to ip_mforward(). We use
2504 	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
2505 	 * register vif. If in the future we have more than one register vifs,
2506 	 * then this will need re-examination.
2507 	 */
2508 	mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
2509 	++ipst->ips_mrtstat->mrts_pim_regforwards;
2510 	ip_rput(q, mp);
2511 	return (0);
2512 }
2513 
2514 /*
2515  * Send an encapsulated packet.
2516  * Caller assumes can continue to use mp when routine returns.
2517  */
2518 /* ARGSUSED */
2519 static void
2520 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2521 {
2522 	mblk_t 	*mp_copy;
2523 	ipha_t 	*ipha_copy;
2524 	size_t	len;
2525 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2526 
2527 	if (ipst->ips_ip_mrtdebug > 1) {
2528 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2529 		    "encap_send: vif %ld enter",
2530 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
2531 	}
2532 	len = ntohs(ipha->ipha_length);
2533 
2534 	/*
2535 	 * Copy the old packet & pullup it's IP header into the
2536 	 * new mbuf so we can modify it.  Try to fill the new
2537 	 * mbuf since if we don't the ethernet driver will.
2538 	 */
2539 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2540 	if (mp_copy == NULL)
2541 		return;
2542 	mp_copy->b_rptr += 32;
2543 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2544 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2545 		freeb(mp_copy);
2546 		return;
2547 	}
2548 
2549 	/*
2550 	 * Fill in the encapsulating IP header.
2551 	 * Remote tunnel dst in rmt_addr, from add_vif().
2552 	 */
2553 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2554 	*ipha_copy = multicast_encap_iphdr;
2555 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2556 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2557 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2558 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2559 	ASSERT(ipha_copy->ipha_ident == 0);
2560 
2561 	/* Turn the encapsulated IP header back into a valid one. */
2562 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2563 	ipha->ipha_ttl--;
2564 	ipha->ipha_hdr_checksum = 0;
2565 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2566 
2567 	if (ipst->ips_ip_mrtdebug > 1) {
2568 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2569 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2570 	}
2571 	if (vifp->v_rate_limit <= 0)
2572 		tbf_send_packet(vifp, mp_copy);
2573 	else
2574 		/* ipha is from the original header */
2575 		tbf_control(vifp, mp_copy, ipha);
2576 }
2577 
2578 /*
2579  * De-encapsulate a packet and feed it back through IP input.
2580  * This routine is called whenever IP gets a packet with prototype
2581  * IPPROTO_ENCAP and a local destination address.
2582  */
2583 void
2584 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
2585 {
2586 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2587 	ipha_t		*ipha_encap;
2588 	int		hlen = IPH_HDR_LENGTH(ipha);
2589 	ipaddr_t	src;
2590 	struct vif	*vifp;
2591 	ip_stack_t	*ipst = ill->ill_ipst;
2592 
2593 	/*
2594 	 * Dump the packet if it's not to a multicast destination or if
2595 	 * we don't have an encapsulating tunnel with the source.
2596 	 * Note:  This code assumes that the remote site IP address
2597 	 * uniquely identifies the tunnel (i.e., that this site has
2598 	 * at most one tunnel with the remote site).
2599 	 */
2600 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2601 	if (!CLASSD(ipha_encap->ipha_dst)) {
2602 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2603 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2604 		freemsg(mp);
2605 		return;
2606 	}
2607 	src = (ipaddr_t)ipha->ipha_src;
2608 	mutex_enter(&ipst->ips_last_encap_lock);
2609 	if (src != ipst->ips_last_encap_src) {
2610 		struct vif *vife;
2611 
2612 		vifp = ipst->ips_vifs;
2613 		vife = vifp + ipst->ips_numvifs;
2614 		ipst->ips_last_encap_src = src;
2615 		ipst->ips_last_encap_vif = 0;
2616 		for (; vifp < vife; ++vifp) {
2617 			if (!lock_good_vif(vifp))
2618 				continue;
2619 			if (vifp->v_rmt_addr.s_addr == src) {
2620 				if (vifp->v_flags & VIFF_TUNNEL)
2621 					ipst->ips_last_encap_vif = vifp;
2622 				if (ipst->ips_ip_mrtdebug > 1) {
2623 					(void) mi_strlog(ipst->ips_ip_g_mrouter,
2624 					    1, SL_TRACE,
2625 					    "ip_mroute_decap: good tun "
2626 					    "vif %ld with %x",
2627 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
2628 					    ntohl(src));
2629 				}
2630 				unlock_good_vif(vifp);
2631 				break;
2632 			}
2633 			unlock_good_vif(vifp);
2634 		}
2635 	}
2636 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
2637 		mutex_exit(&ipst->ips_last_encap_lock);
2638 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2639 		freemsg(mp);
2640 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2641 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2642 		return;
2643 	}
2644 	mutex_exit(&ipst->ips_last_encap_lock);
2645 
2646 	/*
2647 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2648 	 * verify that the packet arrived over the correct vif.)  We use b_prev
2649 	 * to pass this information. This is safe since the ip_rput either
2650 	 * frees the packet or passes it to ip_mforward.
2651 	 */
2652 	mp->b_prev = (mblk_t *)(uintptr_t)src;
2653 	mp->b_rptr += hlen;
2654 	/* Feed back into ip_rput as an M_DATA. */
2655 	ip_rput(q, mp);
2656 }
2657 
2658 /*
2659  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2660  * (stream closed).  Called as writer.
2661  */
2662 void
2663 reset_mrt_vif_ipif(ipif_t *ipif)
2664 {
2665 	vifi_t vifi, tmp_vifi;
2666 	vifi_t num_of_vifs;
2667 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2668 
2669 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2670 
2671 	mutex_enter(&ipst->ips_numvifs_mutex);
2672 	num_of_vifs = ipst->ips_numvifs;
2673 	mutex_exit(&ipst->ips_numvifs_mutex);
2674 
2675 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2676 		tmp_vifi = vifi - 1;
2677 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2678 			(void) del_vif(&tmp_vifi, NULL, NULL, ipst);
2679 		}
2680 	}
2681 }
2682 
2683 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2684 void
2685 reset_mrt_ill(ill_t *ill)
2686 {
2687 	struct mfc		*rt;
2688 	struct rtdetq	*rte;
2689 	int			i;
2690 	ip_stack_t	*ipst = ill->ill_ipst;
2691 
2692 	for (i = 0; i < MFCTBLSIZ; i++) {
2693 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2694 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2695 			if (ipst->ips_ip_mrtdebug > 1) {
2696 				(void) mi_strlog(ipst->ips_ip_g_mrouter, 1,
2697 				    SL_TRACE,
2698 				    "reset_mrt_ill: mfctable [%d]", i);
2699 			}
2700 			while (rt != NULL) {
2701 				mutex_enter(&rt->mfc_mutex);
2702 				while ((rte = rt->mfc_rte) != NULL) {
2703 					if (rte->ill == ill) {
2704 						if (ipst->ips_ip_mrtdebug > 1) {
2705 						(void) mi_strlog(
2706 						    ipst->ips_ip_g_mrouter,
2707 						    1, SL_TRACE,
2708 						    "reset_mrt_ill: "
2709 						    "ill 0x%p", ill);
2710 						}
2711 						rt->mfc_rte = rte->rte_next;
2712 						freemsg(rte->mp);
2713 						mi_free((char *)rte);
2714 					}
2715 				}
2716 				mutex_exit(&rt->mfc_mutex);
2717 				rt = rt->mfc_next;
2718 			}
2719 		}
2720 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
2721 	}
2722 }
2723 
2724 /*
2725  * Token bucket filter module.
2726  * The ipha is for mcastgrp destination for phyint and encap.
2727  */
2728 static void
2729 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2730 {
2731 	size_t 	p_len =  msgdsize(mp);
2732 	struct tbf	*t    = vifp->v_tbf;
2733 	timeout_id_t id = 0;
2734 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2735 
2736 	/* Drop if packet is too large */
2737 	if (p_len > MAX_BKT_SIZE) {
2738 		ipst->ips_mrtstat->mrts_pkt2large++;
2739 		freemsg(mp);
2740 		return;
2741 	}
2742 	if (ipst->ips_ip_mrtdebug > 1) {
2743 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2744 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2745 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2746 		    ntohl(ipha->ipha_dst));
2747 	}
2748 
2749 	mutex_enter(&t->tbf_lock);
2750 
2751 	tbf_update_tokens(vifp);
2752 
2753 	/*
2754 	 * If there are enough tokens,
2755 	 * and the queue is empty, send this packet out.
2756 	 */
2757 	if (ipst->ips_ip_mrtdebug > 1) {
2758 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2759 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2760 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2761 		    t->tbf_q_len);
2762 	}
2763 	/* No packets are queued */
2764 	if (t->tbf_q_len == 0) {
2765 		/* queue empty, send packet if enough tokens */
2766 		if (p_len <= t->tbf_n_tok) {
2767 			t->tbf_n_tok -= p_len;
2768 			mutex_exit(&t->tbf_lock);
2769 			tbf_send_packet(vifp, mp);
2770 			return;
2771 		} else {
2772 			/* Queue packet and timeout till later */
2773 			tbf_queue(vifp, mp);
2774 			ASSERT(vifp->v_timeout_id == 0);
2775 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2776 			    TBF_REPROCESS);
2777 		}
2778 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2779 		/* Finite queue length, so queue pkts and process queue */
2780 		tbf_queue(vifp, mp);
2781 		tbf_process_q(vifp);
2782 	} else {
2783 		/* Check that we have UDP header with IP header */
2784 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2785 					sizeof (struct udphdr);
2786 
2787 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2788 			if (!pullupmsg(mp, hdr_length)) {
2789 				freemsg(mp);
2790 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2791 				    "vif %ld src 0x%x dst 0x%x\n",
2792 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
2793 				    ntohl(ipha->ipha_src),
2794 				    ntohl(ipha->ipha_dst)));
2795 				mutex_exit(&vifp->v_tbf->tbf_lock);
2796 				return;
2797 			} else
2798 				/* Have to reassign ipha after pullupmsg */
2799 				ipha = (ipha_t *)mp->b_rptr;
2800 		}
2801 		/*
2802 		 * Queue length too much,
2803 		 * try to selectively dq, or queue and process
2804 		 */
2805 		if (!tbf_dq_sel(vifp, ipha)) {
2806 			ipst->ips_mrtstat->mrts_q_overflow++;
2807 			freemsg(mp);
2808 		} else {
2809 			tbf_queue(vifp, mp);
2810 			tbf_process_q(vifp);
2811 		}
2812 	}
2813 	if (t->tbf_q_len == 0) {
2814 		id = vifp->v_timeout_id;
2815 		vifp->v_timeout_id = 0;
2816 	}
2817 	mutex_exit(&vifp->v_tbf->tbf_lock);
2818 	if (id != 0)
2819 		(void) untimeout(id);
2820 }
2821 
2822 /*
2823  * Adds a packet to the tbf queue at the interface.
2824  * The ipha is for mcastgrp destination for phyint and encap.
2825  */
2826 static void
2827 tbf_queue(struct vif *vifp, mblk_t *mp)
2828 {
2829 	struct tbf	*t = vifp->v_tbf;
2830 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2831 
2832 	if (ipst->ips_ip_mrtdebug > 1) {
2833 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2834 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2835 	}
2836 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2837 
2838 	if (t->tbf_t == NULL) {
2839 		/* Queue was empty */
2840 		t->tbf_q = mp;
2841 	} else {
2842 		/* Insert at tail */
2843 		t->tbf_t->b_next = mp;
2844 	}
2845 	/* set new tail pointer */
2846 	t->tbf_t = mp;
2847 
2848 	mp->b_next = mp->b_prev = NULL;
2849 
2850 	t->tbf_q_len++;
2851 }
2852 
2853 /*
2854  * Process the queue at the vif interface.
2855  * Drops the tbf_lock when sending packets.
2856  *
2857  * NOTE : The caller should quntimeout if the queue length is 0.
2858  */
2859 static void
2860 tbf_process_q(struct vif *vifp)
2861 {
2862 	mblk_t	*mp;
2863 	struct tbf	*t = vifp->v_tbf;
2864 	size_t	len;
2865 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2866 
2867 	if (ipst->ips_ip_mrtdebug > 1) {
2868 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2869 		    "tbf_process_q 1: vif %ld qlen = %d",
2870 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2871 	}
2872 
2873 	/*
2874 	 * Loop through the queue at the interface and send
2875 	 * as many packets as possible.
2876 	 */
2877 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2878 
2879 	while (t->tbf_q_len > 0) {
2880 		mp = t->tbf_q;
2881 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2882 
2883 		/* Determine if the packet can be sent */
2884 		if (len <= t->tbf_n_tok) {
2885 			/*
2886 			 * If so, reduce no. of tokens, dequeue the packet,
2887 			 * send the packet.
2888 			 */
2889 			t->tbf_n_tok -= len;
2890 
2891 			t->tbf_q = mp->b_next;
2892 			if (--t->tbf_q_len == 0) {
2893 				t->tbf_t = NULL;
2894 			}
2895 			mp->b_next = NULL;
2896 			/* Exit mutex before sending packet, then re-enter */
2897 			mutex_exit(&t->tbf_lock);
2898 			tbf_send_packet(vifp, mp);
2899 			mutex_enter(&t->tbf_lock);
2900 		} else
2901 			break;
2902 	}
2903 }
2904 
2905 /* Called at tbf timeout to update tokens, process q and reset timer.  */
2906 static void
2907 tbf_reprocess_q(void *arg)
2908 {
2909 	struct vif *vifp = arg;
2910 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2911 
2912 	mutex_enter(&vifp->v_tbf->tbf_lock);
2913 	vifp->v_timeout_id = 0;
2914 	tbf_update_tokens(vifp);
2915 
2916 	tbf_process_q(vifp);
2917 
2918 	if (vifp->v_tbf->tbf_q_len > 0) {
2919 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2920 		    TBF_REPROCESS);
2921 	}
2922 	mutex_exit(&vifp->v_tbf->tbf_lock);
2923 
2924 	if (ipst->ips_ip_mrtdebug > 1) {
2925 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2926 		    "tbf_reprcess_q: vif %ld timeout id = %p",
2927 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
2928 	}
2929 }
2930 
2931 /*
2932  * Function that will selectively discard a member of the tbf queue,
2933  * based on the precedence value and the priority.
2934  *
2935  * NOTE : The caller should quntimeout if the queue length is 0.
2936  */
2937 static int
2938 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
2939 {
2940 	uint_t		p;
2941 	struct tbf		*t = vifp->v_tbf;
2942 	mblk_t		**np;
2943 	mblk_t		*last, *mp;
2944 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2945 
2946 	if (ipst->ips_ip_mrtdebug > 1) {
2947 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2948 		    "dq_sel: vif %ld dst 0x%x",
2949 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
2950 	}
2951 
2952 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2953 	p = priority(vifp, ipha);
2954 
2955 	np = &t->tbf_q;
2956 	last = NULL;
2957 	while ((mp = *np) != NULL) {
2958 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
2959 			*np = mp->b_next;
2960 			/* If removing the last packet, fix the tail pointer */
2961 			if (mp == t->tbf_t)
2962 				t->tbf_t = last;
2963 			mp->b_prev = mp->b_next = NULL;
2964 			freemsg(mp);
2965 			/*
2966 			 * It's impossible for the queue to be empty, but
2967 			 * we check anyway.
2968 			 */
2969 			if (--t->tbf_q_len == 0) {
2970 				t->tbf_t = NULL;
2971 			}
2972 			ipst->ips_mrtstat->mrts_drop_sel++;
2973 			return (1);
2974 		}
2975 		np = &mp->b_next;
2976 		last = mp;
2977 	}
2978 	return (0);
2979 }
2980 
2981 /* Sends packet, 2 cases - encap tunnel, phyint.  */
2982 static void
2983 tbf_send_packet(struct vif *vifp, mblk_t *mp)
2984 {
2985 	ipif_t  *ipif;
2986 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2987 
2988 	/* If encap tunnel options */
2989 	if (vifp->v_flags & VIFF_TUNNEL)  {
2990 		if (ipst->ips_ip_mrtdebug > 1) {
2991 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
2992 			    "tbf_send_pkt: ENCAP tunnel vif %ld",
2993 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
2994 		}
2995 
2996 		/*
2997 		 * Feed into ip_wput which will set the ident field and
2998 		 * checksum the encapsulating header.
2999 		 * BSD gets the cached route vifp->v_route from ip_output()
3000 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
3001 		 */
3002 		put(vifp->v_ipif->ipif_wq, mp);
3003 		return;
3004 
3005 		/* phyint */
3006 	} else {
3007 		/* Need to loop back to members on the outgoing interface. */
3008 		ipha_t  *ipha;
3009 		ipaddr_t    dst;
3010 		ipha  = (ipha_t *)mp->b_rptr;
3011 		dst  = ipha->ipha_dst;
3012 		ipif = vifp->v_ipif;
3013 
3014 		mutex_enter(&ipif->ipif_ill->ill_lock);
3015 		if (ilm_lookup_ipif(ipif, dst) != NULL) {
3016 			/*
3017 			 * The packet is not yet reassembled, thus we need to
3018 			 * pass it to ip_rput_local for checksum verification
3019 			 * and reassembly (and fanout the user stream).
3020 			 */
3021 			mblk_t 	*mp_loop;
3022 			ire_t	*ire;
3023 
3024 			mutex_exit(&ipif->ipif_ill->ill_lock);
3025 			if (ipst->ips_ip_mrtdebug > 1) {
3026 				(void) mi_strlog(ipst->ips_ip_g_mrouter, 1,
3027 				    SL_TRACE,
3028 				    "tbf_send_pkt: loopback vif %ld",
3029 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
3030 			}
3031 			mp_loop = copymsg(mp);
3032 			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
3033 			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
3034 
3035 			if (mp_loop != NULL && ire != NULL) {
3036 				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
3037 				    ((ipha_t *)mp_loop->b_rptr),
3038 				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
3039 			} else {
3040 				/* Either copymsg failed or no ire */
3041 				(void) mi_strlog(ipst->ips_ip_g_mrouter, 1,
3042 				    SL_TRACE,
3043 				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
3044 				    "vif %ld\n", mp_loop, ire,
3045 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
3046 			}
3047 			if (ire != NULL)
3048 				ire_refrele(ire);
3049 		} else {
3050 			mutex_exit(&ipif->ipif_ill->ill_lock);
3051 		}
3052 		if (ipst->ips_ip_mrtdebug > 1) {
3053 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
3054 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3055 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3056 		}
3057 		ip_rput_forward_multicast(dst, mp, ipif);
3058 	}
3059 }
3060 
3061 /*
3062  * Determine the current time and then the elapsed time (between the last time
3063  * and time now).  Update the no. of tokens in the bucket.
3064  */
3065 static void
3066 tbf_update_tokens(struct vif *vifp)
3067 {
3068 	timespec_t	tp;
3069 	hrtime_t	tm;
3070 	struct tbf	*t = vifp->v_tbf;
3071 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3072 
3073 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3074 
3075 	/* Time in secs and nsecs, rate limit in kbits/sec */
3076 	gethrestime(&tp);
3077 
3078 	/*LINTED*/
3079 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3080 
3081 	/*
3082 	 * This formula is actually
3083 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3084 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3085 	 *
3086 	 * The (1000/1024) was introduced in add_vif to optimize
3087 	 * this divide into a shift.
3088 	 */
3089 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3090 	t->tbf_last_pkt_t = tp;
3091 
3092 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3093 		t->tbf_n_tok = MAX_BKT_SIZE;
3094 	if (ipst->ips_ip_mrtdebug > 1) {
3095 		(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
3096 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3097 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3098 	}
3099 }
3100 
3101 /*
3102  * Priority currently is based on port nos.
3103  * Different forwarding mechanisms have different ways
3104  * of obtaining the port no. Hence, the vif must be
3105  * given along with the packet itself.
3106  *
3107  */
3108 static int
3109 priority(struct vif *vifp, ipha_t *ipha)
3110 {
3111 	int prio;
3112 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3113 
3114 	/* Temporary hack; may add general packet classifier some day */
3115 
3116 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3117 
3118 	/*
3119 	 * The UDP port space is divided up into four priority ranges:
3120 	 * [0, 16384)	: unclassified - lowest priority
3121 	 * [16384, 32768)	: audio - highest priority
3122 	 * [32768, 49152)	: whiteboard - medium priority
3123 	 * [49152, 65536)	: video - low priority
3124 	 */
3125 
3126 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3127 		struct udphdr *udp =
3128 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3129 		switch (ntohs(udp->uh_dport) & 0xc000) {
3130 		case 0x4000:
3131 			prio = 70;
3132 			break;
3133 		case 0x8000:
3134 			prio = 60;
3135 			break;
3136 		case 0xc000:
3137 			prio = 55;
3138 			break;
3139 		default:
3140 			prio = 50;
3141 			break;
3142 		}
3143 		if (ipst->ips_ip_mrtdebug > 1) {
3144 			(void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE,
3145 			    "priority: port %x prio %d\n",
3146 			    ntohs(udp->uh_dport), prio);
3147 		}
3148 	} else
3149 		prio = 50;  /* default priority */
3150 	return (prio);
3151 }
3152 
3153 /*
3154  * End of token bucket filter modifications
3155  */
3156 
3157 
3158 
3159 /*
3160  * Produces data for netstat -M.
3161  */
3162 int
3163 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3164 {
3165 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3166 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3167 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3168 		sizeof (struct mrtstat))) {
3169 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3170 		    (size_t)sizeof (struct mrtstat)));
3171 		return (0);
3172 	}
3173 	return (1);
3174 }
3175 
3176 /*
3177  * Sends info for SNMP's MIB.
3178  */
3179 int
3180 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3181 {
3182 	struct vifctl 	vi;
3183 	vifi_t		vifi;
3184 
3185 	mutex_enter(&ipst->ips_numvifs_mutex);
3186 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3187 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3188 			continue;
3189 		/*
3190 		 * No locks here, an approximation is fine.
3191 		 */
3192 		vi.vifc_vifi = vifi;
3193 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3194 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3195 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
3196 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
3197 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
3198 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
3199 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
3200 
3201 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3202 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3203 			    (size_t)sizeof (vi)));
3204 			return (0);
3205 		}
3206 	}
3207 	mutex_exit(&ipst->ips_numvifs_mutex);
3208 	return (1);
3209 }
3210 
3211 /*
3212  * Called by ip_snmp_get to send up multicast routing table.
3213  */
3214 int
3215 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3216 {
3217 	int			i, j;
3218 	struct mfc		*rt;
3219 	struct mfcctl	mfcc;
3220 
3221 	/*
3222 	 * Make sure multicast has not been turned off.
3223 	 */
3224 	if (is_mrouter_off(ipst))
3225 		return (1);
3226 
3227 	/* Loop over all hash buckets and their chains */
3228 	for (i = 0; i < MFCTBLSIZ; i++) {
3229 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3230 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3231 			mutex_enter(&rt->mfc_mutex);
3232 			if (rt->mfc_rte != NULL ||
3233 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3234 				mutex_exit(&rt->mfc_mutex);
3235 				continue;
3236 			}
3237 			mfcc.mfcc_origin = rt->mfc_origin;
3238 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3239 			mfcc.mfcc_parent = rt->mfc_parent;
3240 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3241 			mutex_enter(&ipst->ips_numvifs_mutex);
3242 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
3243 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3244 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3245 				mfcc.mfcc_ttls[j] = 0;
3246 			mutex_exit(&ipst->ips_numvifs_mutex);
3247 
3248 			mutex_exit(&rt->mfc_mutex);
3249 			if (!snmp_append_data(mp, (char *)&mfcc,
3250 			    sizeof (mfcc))) {
3251 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
3252 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3253 				    (size_t)sizeof (mfcc)));
3254 				return (0);
3255 			}
3256 		}
3257 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
3258 	}
3259 	return (1);
3260 }
3261