xref: /illumos-gate/usr/src/uts/common/inet/ip/igmp.c (revision 06e1a714)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Internet Group Management Protocol (IGMP) routines.
31  * Multicast Listener Discovery Protocol (MLD) routines.
32  *
33  * Written by Steve Deering, Stanford, May 1988.
34  * Modified by Rosen Sharma, Stanford, Aug 1994.
35  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
36  *
37  * MULTICAST 3.5.1.1
38  */
39 
40 #include <sys/types.h>
41 #include <sys/stream.h>
42 #include <sys/stropts.h>
43 #include <sys/strlog.h>
44 #include <sys/strsun.h>
45 #include <sys/systm.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/cmn_err.h>
49 #include <sys/atomic.h>
50 #include <sys/zone.h>
51 
52 #include <sys/param.h>
53 #include <sys/socket.h>
54 #include <inet/ipclassifier.h>
55 #include <net/if.h>
56 #include <net/route.h>
57 #include <netinet/in.h>
58 #include <netinet/igmp_var.h>
59 #include <netinet/ip6.h>
60 #include <netinet/icmp6.h>
61 
62 #include <inet/common.h>
63 #include <inet/mi.h>
64 #include <inet/nd.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip_listutils.h>
69 
70 #include <netinet/igmp.h>
71 #include <inet/ip_if.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 
/*
 * Forward declarations for the file-local IGMP and MLD helpers.
 */
static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
		    slist_t *srclist, mrec_t *next);
static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
		    mcast_record_t rtype, slist_t *flist);
static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);

/*
 * IGMP timer state.  Following protected by igmp_timer_lock
 *
 * igmp_time_to_next is the interval, in milliseconds, that the pending
 * timeout was (or is about to be) armed for.  igmp_timer_fired_last is a
 * tick value that igmp_start_timers() adds that interval to, and compares
 * with ddi_get_lbolt(), to work out how long the pending timeout still has
 * to run; NOTE(review): it is presumably recorded by the timeout handler,
 * which is not visible here.  igmp_deferred_next accumulates the soonest
 * timer (ms) requested by igmp_joingroup(), which cannot start the timer
 * itself; INFINITY means nothing is deferred.
 */
static int 	igmp_time_to_next;	/* ms the timer was last armed for */
static int 	igmp_timer_fired_last;
uint_t		igmp_deferred_next = INFINITY;
timeout_id_t	igmp_timeout_id = 0;
kmutex_t	igmp_timer_lock;

/* Protected by igmp_slowtimeout_lock */
timeout_id_t	igmp_slowtimeout_id = 0;
kmutex_t	igmp_slowtimeout_lock;

/*
 * MLD timer state; same roles as the IGMP variables above.
 * Following protected by mld_timer_lock
 */
static int 	mld_time_to_next;	/* ms the timer was last armed for */
static int 	mld_timer_fired_last;
uint_t		mld_deferred_next = INFINITY;
timeout_id_t	mld_timeout_id = 0;
kmutex_t	mld_timer_lock;

/* Protected by mld_slowtimeout_lock */
timeout_id_t	mld_slowtimeout_id = 0;
kmutex_t	mld_slowtimeout_lock;

/*
 * Macros used to do timer len conversions.  Timer values are always
 * stored and passed to the timer functions as milliseconds; but the
 * default values and values from the wire may not be.
 *
 * And yes, it's obscure, but decisecond is easier to abbreviate than
 * "tenths of a second".
 */
#define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
#define	SEC_TO_MSEC(sec)	((sec) * 1000)
121 
/*
 * igmp_start_timers:
 * Make sure the IGMP timeout fires no later than 'next' milliseconds from
 * now.  The first multicast join will trigger the igmp timers.
 *
 * 'next' must be neither 0 nor INFINITY.  If no timeout is pending one is
 * started; if the pending timeout already fires sooner it is left alone;
 * otherwise the pending timeout is cancelled and re-armed.  Only one
 * thread at a time acts as the timer setter; a thread that arrives while
 * a setter is active just folds its request into igmp_time_to_next.
 */
void
igmp_start_timers(unsigned next)
{
	int	time_left;
	/* Protected by igmp_timer_lock */
	static  boolean_t igmp_timer_setter_active;
	int	ret;

	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&igmp_timer_lock);

	if (igmp_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		igmp_time_to_next = MIN(igmp_time_to_next, next);
		mutex_exit(&igmp_timer_lock);
		return;
	} else {
		igmp_timer_setter_active = B_TRUE;
	}
	if (igmp_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		igmp_time_to_next = next;
		igmp_timeout_id = timeout(igmp_timeout_handler, NULL,
		    MSEC_TO_TICK(igmp_time_to_next));
		igmp_timer_setter_active = B_FALSE;
		mutex_exit(&igmp_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'igmp_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = igmp_timer_fired_last +
	    MSEC_TO_TICK(igmp_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* The pending timeout already fires soon enough. */
		igmp_timer_setter_active = B_FALSE;
		mutex_exit(&igmp_timer_lock);
		return;
	}

	/*
	 * igmp_timer_lock is dropped across untimeout();
	 * igmp_timer_setter_active stays set meanwhile, so other
	 * callers only record their request and return.
	 */
	mutex_exit(&igmp_timer_lock);
	ret = untimeout(igmp_timeout_id);
	mutex_enter(&igmp_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/* Nothing was cancelled; the id is expected to be cleared. */
		ASSERT(igmp_timeout_id == 0);
	} else {
		ASSERT(igmp_timeout_id != 0);
		igmp_timeout_id = 0;
	}
	if (igmp_time_to_next != 0) {
		/* Also honours requests recorded while the lock was dropped. */
		igmp_time_to_next = MIN(igmp_time_to_next, next);
		igmp_timeout_id = timeout(igmp_timeout_handler, NULL,
		    MSEC_TO_TICK(igmp_time_to_next));
	}
	igmp_timer_setter_active = B_FALSE;
	mutex_exit(&igmp_timer_lock);
}
204 
/*
 * mld_start_timers:
 * Make sure the MLD timeout fires no later than 'next' milliseconds from
 * now.  The unit for next is milliseconds.
 *
 * 'next' must be neither 0 nor INFINITY.  If no timeout is pending one is
 * started; if the pending timeout already fires sooner it is left alone;
 * otherwise the pending timeout is cancelled and re-armed.  Only one
 * thread at a time acts as the timer setter; a thread that arrives while
 * a setter is active just folds its request into mld_time_to_next.
 */
void
mld_start_timers(unsigned next)
{
	int	time_left;
	/* Protected by mld_timer_lock */
	static  boolean_t mld_timer_setter_active;
	int	ret;

	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&mld_timer_lock);
	if (mld_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		mld_time_to_next = MIN(mld_time_to_next, next);
		mutex_exit(&mld_timer_lock);
		return;
	} else {
		mld_timer_setter_active = B_TRUE;
	}
	if (mld_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		mld_time_to_next = next;
		mld_timeout_id = timeout(mld_timeout_handler, NULL,
		    MSEC_TO_TICK(mld_time_to_next));
		mld_timer_setter_active = B_FALSE;
		mutex_exit(&mld_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'mld_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = mld_timer_fired_last +
	    MSEC_TO_TICK(mld_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* The pending timeout already fires soon enough. */
		mld_timer_setter_active = B_FALSE;
		mutex_exit(&mld_timer_lock);
		return;
	}

	/*
	 * mld_timer_lock is dropped across untimeout();
	 * mld_timer_setter_active stays set meanwhile, so other
	 * callers only record their request and return.
	 */
	mutex_exit(&mld_timer_lock);
	ret = untimeout(mld_timeout_id);
	mutex_enter(&mld_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/* Nothing was cancelled; the id is expected to be cleared. */
		ASSERT(mld_timeout_id == 0);
	} else {
		ASSERT(mld_timeout_id != 0);
		mld_timeout_id = 0;
	}
	if (mld_time_to_next != 0) {
		/* Also honours requests recorded while the lock was dropped. */
		mld_time_to_next = MIN(mld_time_to_next, next);
		mld_timeout_id = timeout(mld_timeout_handler, NULL,
		    MSEC_TO_TICK(mld_time_to_next));
	}
	mld_timer_setter_active = B_FALSE;
	mutex_exit(&mld_timer_lock);
}
286 
287 /*
288  * igmp_input:
289  * Return NULL for a bad packet that is discarded here.
290  * Return mp if the message is OK and should be handed to "raw" receivers.
291  * Callers of igmp_input() may need to reinitialize variables that were copied
292  * from the mblk as this calls pullupmsg().
293  */
294 /* ARGSUSED */
295 mblk_t *
296 igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
297 {
298 	igmpa_t 	*igmpa;
299 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
300 	int		iphlen, igmplen, mblklen;
301 	ilm_t 		*ilm;
302 	uint32_t	src, dst;
303 	uint32_t 	group;
304 	uint_t		next;
305 	ipif_t 		*ipif;
306 
307 	ASSERT(ill != NULL);
308 	ASSERT(!ill->ill_isv6);
309 	++igmpstat.igps_rcv_total;
310 
311 	mblklen = MBLKL(mp);
312 	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
313 		++igmpstat.igps_rcv_tooshort;
314 		goto bad_pkt;
315 	}
316 	igmplen = ntohs(ipha->ipha_length) - iphlen;
317 	/*
318 	 * Since msg sizes are more variable with v3, just pullup the
319 	 * whole thing now.
320 	 */
321 	if (MBLKL(mp) < (igmplen + iphlen)) {
322 		mblk_t *mp1;
323 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
324 			++igmpstat.igps_rcv_tooshort;
325 			goto bad_pkt;
326 		}
327 		freemsg(mp);
328 		mp = mp1;
329 		ipha = (ipha_t *)(mp->b_rptr);
330 	}
331 
332 	/*
333 	 * Validate lengths
334 	 */
335 	if (igmplen < IGMP_MINLEN) {
336 		++igmpstat.igps_rcv_tooshort;
337 		goto bad_pkt;
338 	}
339 	/*
340 	 * Validate checksum
341 	 */
342 	if (IP_CSUM(mp, iphlen, 0)) {
343 		++igmpstat.igps_rcv_badsum;
344 		goto bad_pkt;
345 	}
346 
347 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
348 	src = ipha->ipha_src;
349 	dst = ipha->ipha_dst;
350 	if (ip_debug > 1)
351 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
352 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
353 		    (int)ntohl(src), (int)ntohl(dst),
354 		    ill->ill_name);
355 
356 	switch (igmpa->igmpa_type) {
357 	case IGMP_MEMBERSHIP_QUERY:
358 		/*
359 		 * packet length differentiates between v1/v2 and v3
360 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
361 		 */
362 		if (igmplen == IGMP_MINLEN) {
363 			next = igmp_query_in(ipha, igmpa, ill);
364 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
365 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
366 			    igmplen);
367 		} else {
368 			++igmpstat.igps_rcv_tooshort;
369 			goto bad_pkt;
370 		}
371 		if (next == 0)
372 			goto bad_pkt;
373 
374 		if (next != INFINITY)
375 			igmp_start_timers(next);
376 
377 		break;
378 
379 	case IGMP_V1_MEMBERSHIP_REPORT:
380 	case IGMP_V2_MEMBERSHIP_REPORT:
381 		/*
382 		 * For fast leave to work, we have to know that we are the
383 		 * last person to send a report for this group. Reports
384 		 * generated by us are looped back since we could potentially
385 		 * be a multicast router, so discard reports sourced by me.
386 		 */
387 		mutex_enter(&ill->ill_lock);
388 		for (ipif = ill->ill_ipif; ipif != NULL;
389 		    ipif = ipif->ipif_next) {
390 			if (ipif->ipif_lcl_addr == src) {
391 				if (ip_debug > 1) {
392 					(void) mi_strlog(ill->ill_rq,
393 					    1,
394 					    SL_TRACE,
395 					    "igmp_input: we are only "
396 					    "member src 0x%x ipif_local 0x%x",
397 					    (int)ntohl(src),
398 					    (int)
399 					    ntohl(ipif->ipif_lcl_addr));
400 				}
401 				mutex_exit(&ill->ill_lock);
402 				return (mp);
403 			}
404 		}
405 		mutex_exit(&ill->ill_lock);
406 
407 		++igmpstat.igps_rcv_reports;
408 		group = igmpa->igmpa_group;
409 		if (!CLASSD(group)) {
410 			++igmpstat.igps_rcv_badreports;
411 			goto bad_pkt;
412 		}
413 
414 		/*
415 		 * KLUDGE: if the IP source address of the report has an
416 		 * unspecified (i.e., zero) subnet number, as is allowed for
417 		 * a booting host, replace it with the correct subnet number
418 		 * so that a process-level multicast routing demon can
419 		 * determine which subnet it arrived from.  This is necessary
420 		 * to compensate for the lack of any way for a process to
421 		 * determine the arrival interface of an incoming packet.
422 		 *
423 		 * Requires that a copy of *this* message it passed up
424 		 * to the raw interface which is done by our caller.
425 		 */
426 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
427 			/* Pick the first ipif on this ill */
428 			mutex_enter(&ill->ill_lock);
429 			src = ill->ill_ipif->ipif_subnet;
430 			mutex_exit(&ill->ill_lock);
431 			ip1dbg(("igmp_input: changed src to 0x%x\n",
432 			    (int)ntohl(src)));
433 			ipha->ipha_src = src;
434 		}
435 
436 		/*
437 		 * If we belong to the group being reported, and
438 		 * we are a 'Delaying member' in the RFC terminology,
439 		 * stop our timer for that group and 'clear flag' i.e.
440 		 * mark as IGMP_OTHERMEMBER. Do this for all logical
441 		 * interfaces on the given physical interface.
442 		 */
443 		mutex_enter(&ill->ill_lock);
444 		for (ipif = ill->ill_ipif; ipif != NULL;
445 		    ipif = ipif->ipif_next) {
446 			ilm = ilm_lookup_ipif(ipif, group);
447 			if (ilm != NULL) {
448 				++igmpstat.igps_rcv_ourreports;
449 				ilm->ilm_timer = INFINITY;
450 				ilm->ilm_state = IGMP_OTHERMEMBER;
451 			}
452 		} /* for */
453 		mutex_exit(&ill->ill_lock);
454 		break;
455 
456 	case IGMP_V3_MEMBERSHIP_REPORT:
457 		/*
458 		 * Currently nothing to do here; IGMP router is not
459 		 * implemented in ip, and v3 hosts don't pay attention
460 		 * to membership reports.
461 		 */
462 		break;
463 	}
464 	/*
465 	 * Pass all valid IGMP packets up to any process(es) listening
466 	 * on a raw IGMP socket. Do not free the packet.
467 	 */
468 	return (mp);
469 
470 bad_pkt:
471 	freemsg(mp);
472 	return (NULL);
473 }
474 
475 static uint_t
476 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
477 {
478 	ilm_t	*ilm;
479 	int	timer;
480 	uint_t	next;
481 
482 	++igmpstat.igps_rcv_queries;
483 
484 	/*
485 	 * In the IGMPv2 specification, there are 3 states and a flag.
486 	 *
487 	 * In Non-Member state, we simply don't have a membership record.
488 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
489 	 * < INFINITY).  In Idle Member state, our timer is not running
490 	 * (ilm->ilm_timer == INFINITY).
491 	 *
492 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
493 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
494 	 * if I sent the last report.
495 	 */
496 	if (igmpa->igmpa_code == 0) {
497 		/*
498 		 * Query from an old router.
499 		 * Remember that the querier on this interface is old,
500 		 * and set the timer to the value in RFC 1112.
501 		 */
502 
503 
504 		mutex_enter(&ill->ill_lock);
505 		ill->ill_mcast_v1_time = 0;
506 		ill->ill_mcast_v1_tset = 1;
507 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
508 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
509 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
510 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
511 			ill->ill_mcast_type = IGMP_V1_ROUTER;
512 		}
513 		mutex_exit(&ill->ill_lock);
514 
515 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
516 
517 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
518 		    igmpa->igmpa_group != 0) {
519 			++igmpstat.igps_rcv_badqueries;
520 			return (0);
521 		}
522 
523 	} else {
524 		in_addr_t group;
525 
526 		/*
527 		 * Query from a new router
528 		 * Simply do a validity check
529 		 */
530 		group = igmpa->igmpa_group;
531 		if (group != 0 && (!CLASSD(group))) {
532 			++igmpstat.igps_rcv_badqueries;
533 			return (0);
534 		}
535 
536 		/*
537 		 * Switch interface state to v2 on receipt of a v2 query
538 		 * ONLY IF current state is v3.  Let things be if current
539 		 * state if v1 but do reset the v2-querier-present timer.
540 		 */
541 		mutex_enter(&ill->ill_lock);
542 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
543 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
544 			    "to IGMP_V2_ROUTER", ill->ill_name));
545 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
546 			ill->ill_mcast_type = IGMP_V2_ROUTER;
547 		}
548 		ill->ill_mcast_v2_time = 0;
549 		ill->ill_mcast_v2_tset = 1;
550 		mutex_exit(&ill->ill_lock);
551 
552 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
553 	}
554 
555 	if (ip_debug > 1) {
556 		mutex_enter(&ill->ill_lock);
557 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
558 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
559 		    (int)ntohs(igmpa->igmpa_code),
560 		    (int)ntohs(igmpa->igmpa_type));
561 		mutex_exit(&ill->ill_lock);
562 	}
563 
564 	/*
565 	 * -Start the timers in all of our membership records
566 	 *  for the physical interface on which the query
567 	 *  arrived, excluding those that belong to the "all
568 	 *  hosts" group (224.0.0.1).
569 	 *
570 	 * -Restart any timer that is already running but has
571 	 *  a value longer than the requested timeout.
572 	 *
573 	 * -Use the value specified in the query message as
574 	 *  the maximum timeout.
575 	 */
576 	next = (unsigned)INFINITY;
577 	mutex_enter(&ill->ill_lock);
578 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
579 
580 		/*
581 		 * A multicast router joins INADDR_ANY address
582 		 * to enable promiscuous reception of all
583 		 * mcasts from the interface. This INADDR_ANY
584 		 * is stored in the ilm_v6addr as V6 unspec addr
585 		 */
586 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
587 			continue;
588 		if (ilm->ilm_addr == htonl(INADDR_ANY))
589 			continue;
590 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
591 		    (igmpa->igmpa_group == 0) ||
592 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
593 			if (ilm->ilm_timer > timer) {
594 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
595 				if (ilm->ilm_timer < next)
596 					next = ilm->ilm_timer;
597 			}
598 		}
599 	}
600 	mutex_exit(&ill->ill_lock);
601 
602 	return (next);
603 }
604 
/*
 * igmpv3_query_in:
 * Process an IGMPv3 membership query received on 'ill'.
 *
 * igmp3qa - the v3 query; its source list follows the fixed header
 * ill     - interface the query arrived on
 * igmplen - length of the IGMP message in bytes
 *
 * Returns 0 if the source count does not fit in the message (the caller
 * discards the packet), INFINITY if no timer was scheduled, and otherwise
 * the soonest timer value, in milliseconds, that was set.
 */
static uint_t
igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
{
	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
	ilm_t		*ilm;
	ipaddr_t	*src_array;
	uint8_t		qrv;

	/* make sure numsrc matches packet size */
	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
		++igmpstat.igps_rcv_tooshort;
		return (0);
	}
	src_array = (ipaddr_t *)&igmp3qa[1];

	++igmpstat.igps_rcv_queries;

	/*
	 * Max response code: values at or above IGMP_V3_MAXRT_FPMIN are
	 * decoded from their mantissa and exponent fields; smaller values
	 * are used directly.
	 */
	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
		mrd = (mant | 0x10) << (exp + 3);
	}
	if (mrd == 0)
		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
	/* mrd is treated as deciseconds; timers are kept in milliseconds. */
	timer = DSEC_TO_MSEC(mrd);
	MCAST_RANDOM_DELAY(delay, timer);
	next = (unsigned)INFINITY;

	/* Adopt the querier's robustness variable, or the default if 0. */
	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	else
		ill->ill_mcast_rv = qrv;

	/* The query interval code uses the same encoding as above. */
	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
		qqi = (mant | 0x10) << (exp + 3);
	}
	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;

	/*
	 * If we have a pending general query response that's scheduled
	 * sooner than the delay we calculated for this response, then
	 * no action is required (RFC3376 section 5.2 rule 1)
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_global_timer < delay) {
		mutex_exit(&ill->ill_lock);
		return (next);
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Now take action depending upon query type:
	 * general, group specific, or group/source specific.
	 */
	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
		/*
		 * general query
		 * We know global timer is either not running or is
		 * greater than our calculated delay, so reset it to
		 * our delay (random value in range [0, response time]).
		 */
		mutex_enter(&ill->ill_lock);
		ill->ill_global_timer = delay;
		next = ill->ill_global_timer;
		mutex_exit(&ill->ill_lock);

	} else {
		/* group or group/source specific query */
		mutex_enter(&ill->ill_lock);
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			/*
			 * Skip non-IPv4 memberships, the INADDR_ANY and
			 * all-hosts memberships, and any group other than
			 * the one being queried.
			 */
			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
				continue;
			/*
			 * If the query is group specific or we have a
			 * pending group specific query, the response is
			 * group specific (pending sources list should be
			 * empty).  Otherwise, need to update the pending
			 * sources list for the group and source specific
			 * response.
			 */
			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
				/*
				 * Also the fallback target of the source
				 * specific branch below, which jumps here
				 * whenever it cannot record the sources.
				 */
group_query:
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
			} else {
				boolean_t overflow;
				slist_t *pktl;
				if (numsrc > MAX_FILTER_SIZE ||
				    (ilm->ilm_pendsrcs == NULL &&
				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
					/*
					 * We've been sent more sources than
					 * we can deal with; or we can't deal
					 * with a source list at all.  Revert
					 * to a group specific query.
					 */
					goto group_query;
				}
				if ((pktl = l_alloc()) == NULL)
					goto group_query;
				/*
				 * Copy the packet's sources into a
				 * temporary list as v4-mapped addresses and
				 * merge that into the pending sources.
				 */
				pktl->sl_numsrc = numsrc;
				for (i = 0; i < numsrc; i++)
					IN6_IPADDR_TO_V4MAPPED(src_array[i],
					    &(pktl->sl_addr[i]));
				l_union_in_a(ilm->ilm_pendsrcs, pktl,
				    &overflow);
				l_free(pktl);
				if (overflow)
					goto group_query;
			}
			/* choose soonest timer */
			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;
		}
		mutex_exit(&ill->ill_lock);
	}

	return (next);
}
736 
737 void
738 igmp_joingroup(ilm_t *ilm)
739 {
740 	ill_t	*ill;
741 
742 	ill = ilm->ilm_ipif->ipif_ill;
743 
744 	ASSERT(IAM_WRITER_ILL(ill));
745 	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);
746 
747 	mutex_enter(&ill->ill_lock);
748 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
749 		ilm->ilm_rtx.rtx_timer = INFINITY;
750 		ilm->ilm_state = IGMP_OTHERMEMBER;
751 		mutex_exit(&ill->ill_lock);
752 	} else {
753 		ip1dbg(("Querier mode %d, sending report, group %x\n",
754 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
755 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
756 			mutex_exit(&ill->ill_lock);
757 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
758 			mutex_enter(&ill->ill_lock);
759 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
760 			mutex_exit(&ill->ill_lock);
761 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
762 			mutex_enter(&ill->ill_lock);
763 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
764 			mrec_t *rp;
765 			mcast_record_t rtype;
766 			/*
767 			 * The possible state changes we need to handle here:
768 			 *   Old State	New State	Report
769 			 *
770 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
771 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
772 			 *
773 			 * No need to send the BLOCK(0) report; ALLOW(X)
774 			 * is enough.
775 			 */
776 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
777 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
778 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
779 			    ilm->ilm_filter, NULL);
780 			mutex_exit(&ill->ill_lock);
781 			igmpv3_sendrpt(ilm->ilm_ipif, rp);
782 			mutex_enter(&ill->ill_lock);
783 			/*
784 			 * Set up retransmission state.  Timer is set below,
785 			 * for both v3 and older versions.
786 			 */
787 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
788 			    ilm->ilm_filter);
789 		}
790 
791 		/* Set the ilm timer value */
792 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
793 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
794 		ilm->ilm_state = IGMP_IREPORTEDLAST;
795 		mutex_exit(&ill->ill_lock);
796 
797 		/*
798 		 * To avoid deadlock, we don't call igmp_start_timers from
799 		 * here. igmp_start_timers needs to call untimeout, and we
800 		 * can't hold the ipsq across untimeout since
801 		 * igmp_timeout_handler could be blocking trying to
802 		 * acquire the ipsq. Instead we start the timer after we get
803 		 * out of the ipsq in ipsq_exit.
804 		 */
805 		mutex_enter(&igmp_timer_lock);
806 		igmp_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
807 		    igmp_deferred_next);
808 		mutex_exit(&igmp_timer_lock);
809 	}
810 
811 	if (ip_debug > 1) {
812 		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
813 		    "igmp_joingroup: multicast_type %d timer %d",
814 		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
815 		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
816 	}
817 }
818 
819 void
820 mld_joingroup(ilm_t *ilm)
821 {
822 	ill_t	*ill;
823 
824 	ill = ilm->ilm_ill;
825 
826 	ASSERT(IAM_WRITER_ILL(ill));
827 	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);
828 
829 	mutex_enter(&ill->ill_lock);
830 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
831 		ilm->ilm_rtx.rtx_timer = INFINITY;
832 		ilm->ilm_state = IGMP_OTHERMEMBER;
833 		mutex_exit(&ill->ill_lock);
834 	} else {
835 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
836 			mutex_exit(&ill->ill_lock);
837 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
838 			mutex_enter(&ill->ill_lock);
839 		} else {
840 			mrec_t *rp;
841 			mcast_record_t rtype;
842 			/*
843 			 * The possible state changes we need to handle here:
844 			 *	Old State   New State	Report
845 			 *
846 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
847 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
848 			 *
849 			 * No need to send the BLOCK(0) report; ALLOW(X)
850 			 * is enough
851 			 */
852 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
853 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
854 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
855 			    ilm->ilm_filter, NULL);
856 			mutex_exit(&ill->ill_lock);
857 			mldv2_sendrpt(ill, rp);
858 			mutex_enter(&ill->ill_lock);
859 			/*
860 			 * Set up retransmission state.  Timer is set below,
861 			 * for both v2 and v1.
862 			 */
863 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
864 			    ilm->ilm_filter);
865 		}
866 
867 		/* Set the ilm timer value */
868 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
869 		    ilm->ilm_rtx.rtx_cnt > 0);
870 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
871 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
872 		ilm->ilm_state = IGMP_IREPORTEDLAST;
873 		mutex_exit(&ill->ill_lock);
874 
875 		/*
876 		 * To avoid deadlock, we don't call mld_start_timers from
877 		 * here. mld_start_timers needs to call untimeout, and we
878 		 * can't hold the ipsq (i.e. the lock) across untimeout
879 		 * since mld_timeout_handler could be blocking trying to
880 		 * acquire the ipsq. Instead we start the timer after we get
881 		 * out of the ipsq in ipsq_exit
882 		 */
883 		mutex_enter(&mld_timer_lock);
884 		mld_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
885 		    mld_deferred_next);
886 		mutex_exit(&mld_timer_lock);
887 	}
888 
889 	if (ip_debug > 1) {
890 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
891 		    "mld_joingroup: multicast_type %d timer %d",
892 		    (ilm->ilm_ill->ill_mcast_type),
893 		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
894 	}
895 }
896 
897 void
898 igmp_leavegroup(ilm_t *ilm)
899 {
900 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
901 
902 	ASSERT(ilm->ilm_ill == NULL);
903 	ASSERT(!ill->ill_isv6);
904 
905 	mutex_enter(&ill->ill_lock);
906 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
907 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
908 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
909 		mutex_exit(&ill->ill_lock);
910 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
911 		    (htonl(INADDR_ALLRTRS_GROUP)));
912 		return;
913 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
914 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
915 		mrec_t *rp;
916 		/*
917 		 * The possible state changes we need to handle here:
918 		 *	Old State	New State	Report
919 		 *
920 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
921 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
922 		 *
923 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
924 		 */
925 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
926 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
927 			    ilm->ilm_filter, NULL);
928 		} else {
929 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
930 			    NULL, NULL);
931 		}
932 		mutex_exit(&ill->ill_lock);
933 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
934 		return;
935 	}
936 	mutex_exit(&ill->ill_lock);
937 }
938 
939 void
940 mld_leavegroup(ilm_t *ilm)
941 {
942 	ill_t *ill = ilm->ilm_ill;
943 
944 	ASSERT(ilm->ilm_ipif == NULL);
945 	ASSERT(ill->ill_isv6);
946 
947 	mutex_enter(&ill->ill_lock);
948 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
949 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
950 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
951 		mutex_exit(&ill->ill_lock);
952 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
953 		return;
954 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
955 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
956 		mrec_t *rp;
957 		/*
958 		 * The possible state changes we need to handle here:
959 		 *	Old State	New State	Report
960 		 *
961 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
962 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
963 		 *
964 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
965 		 */
966 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
967 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
968 			    ilm->ilm_filter, NULL);
969 		} else {
970 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
971 			    NULL, NULL);
972 		}
973 		mutex_exit(&ill->ill_lock);
974 		mldv2_sendrpt(ill, rp);
975 		return;
976 	}
977 	mutex_exit(&ill->ill_lock);
978 }
979 
980 void
981 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
982 {
983 	ill_t *ill;
984 	mrec_t *rp;
985 
986 	ASSERT(ilm != NULL);
987 
988 	/* state change reports should only be sent if the router is v3 */
989 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
990 		return;
991 
992 	if (ilm->ilm_ill == NULL) {
993 		ASSERT(ilm->ilm_ipif != NULL);
994 		ill = ilm->ilm_ipif->ipif_ill;
995 	} else {
996 		ill = ilm->ilm_ill;
997 	}
998 
999 	mutex_enter(&ill->ill_lock);
1000 
1001 	/*
1002 	 * Compare existing(old) state with the new state and prepare
1003 	 * State Change Report, according to the rules in RFC 3376:
1004 	 *
1005 	 *	Old State	New State	State Change Report
1006 	 *
1007 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1008 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1009 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1010 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1011 	 */
1012 
1013 	if (ilm->ilm_fmode == fmode) {
1014 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1015 		slist_t *allow, *block;
1016 		if (((a_minus_b = l_alloc()) == NULL) ||
1017 		    ((b_minus_a = l_alloc()) == NULL)) {
1018 			l_free(a_minus_b);
1019 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1020 				goto send_to_ex;
1021 			else
1022 				goto send_to_in;
1023 		}
1024 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1025 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1026 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1027 			allow = b_minus_a;
1028 			block = a_minus_b;
1029 		} else {
1030 			allow = a_minus_b;
1031 			block = b_minus_a;
1032 		}
1033 		rp = NULL;
1034 		if (!SLIST_IS_EMPTY(allow))
1035 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1036 			    allow, rp);
1037 		if (!SLIST_IS_EMPTY(block))
1038 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1039 			    block, rp);
1040 		l_free(a_minus_b);
1041 		l_free(b_minus_a);
1042 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1043 send_to_ex:
1044 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1045 		    NULL);
1046 	} else {
1047 send_to_in:
1048 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1049 		    NULL);
1050 	}
1051 
1052 	/*
1053 	 * Need to set up retransmission state; merge the new info with the
1054 	 * current state (which may be null).  If the timer is not currently
1055 	 * running, start it (need to do a delayed start of the timer as
1056 	 * we're currently in the sq).
1057 	 */
1058 	rp = mcast_merge_rtx(ilm, rp, flist);
1059 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1060 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1061 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1062 		mutex_enter(&igmp_timer_lock);
1063 		igmp_deferred_next = MIN(igmp_deferred_next,
1064 		    ilm->ilm_rtx.rtx_timer);
1065 		mutex_exit(&igmp_timer_lock);
1066 	}
1067 
1068 	mutex_exit(&ill->ill_lock);
1069 	igmpv3_sendrpt(ilm->ilm_ipif, rp);
1070 }
1071 
1072 void
1073 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1074 {
1075 	ill_t *ill;
1076 	mrec_t *rp = NULL;
1077 
1078 	ASSERT(ilm != NULL);
1079 
1080 	ill = ilm->ilm_ill;
1081 
1082 	/* only need to send if we have an mldv2-capable router */
1083 	mutex_enter(&ill->ill_lock);
1084 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1085 		mutex_exit(&ill->ill_lock);
1086 		return;
1087 	}
1088 
1089 	/*
1090 	 * Compare existing (old) state with the new state passed in
1091 	 * and send appropriate MLDv2 State Change Report.
1092 	 *
1093 	 *	Old State	New State	State Change Report
1094 	 *
1095 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1096 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1097 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1098 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1099 	 */
1100 	if (ilm->ilm_fmode == fmode) {
1101 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1102 		slist_t *allow, *block;
1103 		if (((a_minus_b = l_alloc()) == NULL) ||
1104 		    ((b_minus_a = l_alloc()) == NULL)) {
1105 			l_free(a_minus_b);
1106 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1107 				goto send_to_ex;
1108 			else
1109 				goto send_to_in;
1110 		}
1111 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1112 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1113 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1114 			allow = b_minus_a;
1115 			block = a_minus_b;
1116 		} else {
1117 			allow = a_minus_b;
1118 			block = b_minus_a;
1119 		}
1120 		if (!SLIST_IS_EMPTY(allow))
1121 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1122 			    allow, rp);
1123 		if (!SLIST_IS_EMPTY(block))
1124 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1125 			    block, rp);
1126 		l_free(a_minus_b);
1127 		l_free(b_minus_a);
1128 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1129 send_to_ex:
1130 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1131 		    NULL);
1132 	} else {
1133 send_to_in:
1134 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1135 		    NULL);
1136 	}
1137 
1138 	/*
1139 	 * Need to set up retransmission state; merge the new info with the
1140 	 * current state (which may be null).  If the timer is not currently
1141 	 * running, start it (need to do a deferred start of the timer as
1142 	 * we're currently in the sq).
1143 	 */
1144 	rp = mcast_merge_rtx(ilm, rp, flist);
1145 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1146 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1147 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1148 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1149 		mutex_enter(&mld_timer_lock);
1150 		mld_deferred_next =
1151 		    MIN(mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1152 		mutex_exit(&mld_timer_lock);
1153 	}
1154 
1155 	mutex_exit(&ill->ill_lock);
1156 	mldv2_sendrpt(ill, rp);
1157 }
1158 
1159 uint_t
1160 igmp_timeout_handler_per_ill(ill_t *ill, int elapsed)
1161 {
1162 	uint_t	next = INFINITY;
1163 	ilm_t	*ilm;
1164 	ipif_t	*ipif;
1165 	mrec_t	*rp = NULL;
1166 	mrec_t	*rtxrp = NULL;
1167 	rtx_state_t *rtxp;
1168 	mcast_record_t	rtype;
1169 
1170 	ASSERT(IAM_WRITER_ILL(ill));
1171 
1172 	mutex_enter(&ill->ill_lock);
1173 
1174 	/* First check the global timer on this interface */
1175 	if (ill->ill_global_timer == INFINITY)
1176 		goto per_ilm_timer;
1177 	if (ill->ill_global_timer <= elapsed) {
1178 		ill->ill_global_timer = INFINITY;
1179 		/*
1180 		 * Send report for each group on this interface.
1181 		 * Since we just set the global timer (received a v3 general
1182 		 * query), need to skip the all hosts addr (224.0.0.1), per
1183 		 * RFC 3376 section 5.
1184 		 */
1185 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1186 			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
1187 				continue;
1188 			ASSERT(ilm->ilm_ipif != NULL);
1189 			ilm->ilm_ipif->ipif_igmp_rpt =
1190 			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1191 			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
1192 			/*
1193 			 * Since we're sending a report on this group, okay
1194 			 * to delete pending group-specific timers.  Note
1195 			 * that group-specific retransmit timers still need
1196 			 * to be checked in the per_ilm_timer for-loop.
1197 			 */
1198 			ilm->ilm_timer = INFINITY;
1199 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1200 			FREE_SLIST(ilm->ilm_pendsrcs);
1201 			ilm->ilm_pendsrcs = NULL;
1202 		}
1203 		/*
1204 		 * We've built per-ipif mrec lists; walk the ill's ipif list
1205 		 * and send a report for each ipif that has an mrec list.
1206 		 */
1207 		for (ipif = ill->ill_ipif; ipif != NULL;
1208 		    ipif = ipif->ipif_next) {
1209 			if (ipif->ipif_igmp_rpt == NULL)
1210 				continue;
1211 			mutex_exit(&ill->ill_lock);
1212 			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
1213 			mutex_enter(&ill->ill_lock);
1214 			/* mrec list was freed by igmpv3_sendrpt() */
1215 			ipif->ipif_igmp_rpt = NULL;
1216 		}
1217 	} else {
1218 		ill->ill_global_timer -= elapsed;
1219 		if (ill->ill_global_timer < next)
1220 			next = ill->ill_global_timer;
1221 	}
1222 
1223 per_ilm_timer:
1224 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1225 		if (ilm->ilm_timer == INFINITY)
1226 			goto per_ilm_rtxtimer;
1227 
1228 		if (ilm->ilm_timer > elapsed) {
1229 			ilm->ilm_timer -= elapsed;
1230 			if (ilm->ilm_timer < next)
1231 				next = ilm->ilm_timer;
1232 
1233 			if (ip_debug > 1) {
1234 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1235 				    "igmp_timo_hlr 2: ilm_timr %d elap %d "
1236 				    "typ %d nxt %d",
1237 				    (int)ntohl(ilm->ilm_timer), elapsed,
1238 				    (ill->ill_mcast_type), next);
1239 			}
1240 
1241 			goto per_ilm_rtxtimer;
1242 		}
1243 
1244 		/* the timer has expired, need to take action */
1245 		ilm->ilm_timer = INFINITY;
1246 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1247 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1248 			mutex_exit(&ill->ill_lock);
1249 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1250 			mutex_enter(&ill->ill_lock);
1251 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1252 			mutex_exit(&ill->ill_lock);
1253 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1254 			mutex_enter(&ill->ill_lock);
1255 		} else {
1256 			slist_t *rsp;
1257 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1258 			    (rsp = l_alloc()) != NULL) {
1259 				/*
1260 				 * Contents of reply depend on pending
1261 				 * requested source list.
1262 				 */
1263 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1264 					l_intersection(ilm->ilm_filter,
1265 					    ilm->ilm_pendsrcs, rsp);
1266 				} else {
1267 					l_difference(ilm->ilm_pendsrcs,
1268 					    ilm->ilm_filter, rsp);
1269 				}
1270 				FREE_SLIST(ilm->ilm_pendsrcs);
1271 				ilm->ilm_pendsrcs = NULL;
1272 				if (!SLIST_IS_EMPTY(rsp))
1273 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1274 					    &ilm->ilm_v6addr, rsp, rp);
1275 				FREE_SLIST(rsp);
1276 			} else {
1277 				/*
1278 				 * Either the pending request is just group-
1279 				 * specific, or we couldn't get the resources
1280 				 * (rsp) to build a source-specific reply.
1281 				 */
1282 				rp = mcast_bldmrec(ilm->ilm_fmode,
1283 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1284 			}
1285 			mutex_exit(&ill->ill_lock);
1286 			igmpv3_sendrpt(ill->ill_ipif, rp);
1287 			mutex_enter(&ill->ill_lock);
1288 			rp = NULL;
1289 		}
1290 
1291 		if (ip_debug > 1) {
1292 			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1293 			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
1294 			    "typ %d nxt %d",
1295 			    (int)ntohl(ilm->ilm_timer), elapsed,
1296 			    (ill->ill_mcast_type), next);
1297 		}
1298 
1299 per_ilm_rtxtimer:
1300 		rtxp = &ilm->ilm_rtx;
1301 
1302 		if (rtxp->rtx_timer == INFINITY)
1303 			continue;
1304 		if (rtxp->rtx_timer > elapsed) {
1305 			rtxp->rtx_timer -= elapsed;
1306 			if (rtxp->rtx_timer < next)
1307 				next = rtxp->rtx_timer;
1308 			continue;
1309 		}
1310 
1311 		rtxp->rtx_timer = INFINITY;
1312 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1313 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1314 			mutex_exit(&ill->ill_lock);
1315 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1316 			mutex_enter(&ill->ill_lock);
1317 			continue;
1318 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1319 			mutex_exit(&ill->ill_lock);
1320 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1321 			mutex_enter(&ill->ill_lock);
1322 			continue;
1323 		}
1324 
1325 		/*
1326 		 * The retransmit timer has popped, and our router is
1327 		 * IGMPv3.  We have to delve into the retransmit state
1328 		 * stored in the ilm.
1329 		 *
1330 		 * Decrement the retransmit count.  If the fmode rtx
1331 		 * count is active, decrement it, and send a filter
1332 		 * mode change report with the ilm's source list.
1333 		 * Otherwise, send a source list change report with
1334 		 * the current retransmit lists.
1335 		 */
1336 		ASSERT(rtxp->rtx_cnt > 0);
1337 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1338 		rtxp->rtx_cnt--;
1339 		if (rtxp->rtx_fmode_cnt > 0) {
1340 			rtxp->rtx_fmode_cnt--;
1341 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1342 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1343 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1344 			    ilm->ilm_filter, rtxrp);
1345 		} else {
1346 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1347 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1348 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1349 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1350 		}
1351 		if (rtxp->rtx_cnt > 0) {
1352 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1353 			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1354 			if (rtxp->rtx_timer < next)
1355 				next = rtxp->rtx_timer;
1356 		} else {
1357 			CLEAR_SLIST(rtxp->rtx_allow);
1358 			CLEAR_SLIST(rtxp->rtx_block);
1359 		}
1360 		mutex_exit(&ill->ill_lock);
1361 		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
1362 		mutex_enter(&ill->ill_lock);
1363 		rtxrp = NULL;
1364 	}
1365 
1366 	mutex_exit(&ill->ill_lock);
1367 
1368 	return (next);
1369 }
1370 
1371 /*
1372  * igmp_timeout_handler:
1373  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1374  * Returns number of ticks to next event (or 0 if none).
1375  *
1376  * As part of multicast join and leave igmp we may need to send out an
1377  * igmp request. The igmp related state variables in the ilm are protected
1378  * by ill_lock. A single global igmp timer is used to track igmp timeouts.
1379  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1380  * starts the igmp timer if needed. It serializes multiple threads trying to
1381  * simultaneously start the timer using the igmp_timer_setter_active flag.
1382  *
1383  * igmp_input() receives igmp queries and responds to the queries
1384  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
1385  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
1386  * performs the action exclusively after entering each ill's ipsq as writer.
1387  * The actual igmp timeout handler needs to run in the ipsq since it has to
1388  * access the ilm's and we don't want another exclusive operation like
1389  * say an IPMP failover to be simultaneously moving the ilms from one ill to
1390  * another.
1391  *
1392  * The igmp_slowtimeo() function is called thru another timer.
1393  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1394  */
1395 
1396 /* ARGSUSED */
1397 void
1398 igmp_timeout_handler(void *arg)
1399 {
1400 	ill_t	*ill;
1401 	int	elapsed;	/* Since last call */
1402 	uint_t  global_next = INFINITY;
1403 	uint_t  next;
1404 	ill_walk_context_t ctx;
1405 	boolean_t success;
1406 
1407 	mutex_enter(&igmp_timer_lock);
1408 	ASSERT(igmp_timeout_id != 0);
1409 	igmp_timer_fired_last = ddi_get_lbolt();
1410 	elapsed = igmp_time_to_next;
1411 	igmp_time_to_next = 0;
1412 	mutex_exit(&igmp_timer_lock);
1413 
1414 	rw_enter(&ill_g_lock, RW_READER);
1415 	ill = ILL_START_WALK_V4(&ctx);
1416 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1417 		ASSERT(!ill->ill_isv6);
1418 		/*
1419 		 * We may not be able to refhold the ill if the ill/ipif
1420 		 * is changing. But we need to make sure that the ill will
1421 		 * not vanish. So we just bump up the ill_waiter count.
1422 		 */
1423 		if (!ill_waiter_inc(ill))
1424 			continue;
1425 		rw_exit(&ill_g_lock);
1426 		success = ipsq_enter(ill, B_TRUE);
1427 		if (success) {
1428 			next = igmp_timeout_handler_per_ill(ill, elapsed);
1429 			if (next < global_next)
1430 				global_next = next;
1431 			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_FALSE,
1432 			    B_TRUE);
1433 		}
1434 		rw_enter(&ill_g_lock, RW_READER);
1435 		ill_waiter_dcr(ill);
1436 	}
1437 	rw_exit(&ill_g_lock);
1438 
1439 	mutex_enter(&igmp_timer_lock);
1440 	ASSERT(igmp_timeout_id != 0);
1441 	igmp_timeout_id = 0;
1442 	mutex_exit(&igmp_timer_lock);
1443 
1444 	if (global_next != INFINITY)
1445 		igmp_start_timers(global_next);
1446 }
1447 
1448 /*
1449  * mld_timeout_handler:
1450  * Called when there are timeout events, every next (tick).
1451  * Returns number of ticks to next event (or 0 if none).
1452  */
1453 /* ARGSUSED */
1454 uint_t
1455 mld_timeout_handler_per_ill(ill_t *ill, int elapsed)
1456 {
1457 	ilm_t 	*ilm;
1458 	uint_t	next = INFINITY;
1459 	mrec_t	*rp, *rtxrp;
1460 	rtx_state_t *rtxp;
1461 	mcast_record_t	rtype;
1462 
1463 	ASSERT(IAM_WRITER_ILL(ill));
1464 
1465 	mutex_enter(&ill->ill_lock);
1466 
1467 	/*
1468 	 * First check the global timer on this interface; the global timer
1469 	 * is not used for MLDv1, so if it's set we can assume we're v2.
1470 	 */
1471 	if (ill->ill_global_timer == INFINITY)
1472 		goto per_ilm_timer;
1473 	if (ill->ill_global_timer <= elapsed) {
1474 		ill->ill_global_timer = INFINITY;
1475 		/*
1476 		 * Send report for each group on this interface.
1477 		 * Since we just set the global timer (received a v2 general
1478 		 * query), need to skip the all hosts addr (ff02::1), per
1479 		 * RFC 3810 section 6.
1480 		 */
1481 		rp = NULL;
1482 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1483 			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
1484 			    &ipv6_all_hosts_mcast))
1485 				continue;
1486 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1487 			    ilm->ilm_filter, rp);
1488 			/*
1489 			 * Since we're sending a report on this group, okay
1490 			 * to delete pending group-specific timers.  Note
1491 			 * that group-specific retransmit timers still need
1492 			 * to be checked in the per_ilm_timer for-loop.
1493 			 */
1494 			ilm->ilm_timer = INFINITY;
1495 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1496 			FREE_SLIST(ilm->ilm_pendsrcs);
1497 			ilm->ilm_pendsrcs = NULL;
1498 		}
1499 		mutex_exit(&ill->ill_lock);
1500 		mldv2_sendrpt(ill, rp);
1501 		mutex_enter(&ill->ill_lock);
1502 	} else {
1503 		ill->ill_global_timer -= elapsed;
1504 		if (ill->ill_global_timer < next)
1505 			next = ill->ill_global_timer;
1506 	}
1507 
1508 per_ilm_timer:
1509 	rp = rtxrp = NULL;
1510 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1511 		if (ilm->ilm_timer == INFINITY)
1512 			goto per_ilm_rtxtimer;
1513 
1514 		if (ilm->ilm_timer > elapsed) {
1515 			ilm->ilm_timer -= elapsed;
1516 			if (ilm->ilm_timer < next)
1517 				next = ilm->ilm_timer;
1518 
1519 			if (ip_debug > 1) {
1520 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1521 				    "igmp_timo_hlr 2: ilm_timr"
1522 				    " %d elap %d typ %d nxt %d",
1523 				    (int)ntohl(ilm->ilm_timer), elapsed,
1524 				    (ill->ill_mcast_type), next);
1525 			}
1526 
1527 			goto per_ilm_rtxtimer;
1528 		}
1529 
1530 		/* the timer has expired, need to take action */
1531 		ilm->ilm_timer = INFINITY;
1532 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1533 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1534 			mutex_exit(&ill->ill_lock);
1535 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1536 			mutex_enter(&ill->ill_lock);
1537 		} else {
1538 			slist_t *rsp;
1539 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1540 			    (rsp = l_alloc()) != NULL) {
1541 				/*
1542 				 * Contents of reply depend on pending
1543 				 * requested source list.
1544 				 */
1545 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1546 					l_intersection(ilm->ilm_filter,
1547 					    ilm->ilm_pendsrcs, rsp);
1548 				} else {
1549 					l_difference(ilm->ilm_pendsrcs,
1550 					    ilm->ilm_filter, rsp);
1551 				}
1552 				FREE_SLIST(ilm->ilm_pendsrcs);
1553 				ilm->ilm_pendsrcs = NULL;
1554 				if (!SLIST_IS_EMPTY(rsp))
1555 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1556 					    &ilm->ilm_v6addr, rsp, rp);
1557 				FREE_SLIST(rsp);
1558 			} else {
1559 				rp = mcast_bldmrec(ilm->ilm_fmode,
1560 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1561 			}
1562 		}
1563 
1564 		if (ip_debug > 1) {
1565 			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1566 			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
1567 			    "typ %d nxt %d",
1568 			    (int)ntohl(ilm->ilm_timer), elapsed,
1569 			    (ill->ill_mcast_type), next);
1570 		}
1571 
1572 per_ilm_rtxtimer:
1573 		rtxp = &ilm->ilm_rtx;
1574 
1575 		if (rtxp->rtx_timer == INFINITY)
1576 			continue;
1577 		if (rtxp->rtx_timer > elapsed) {
1578 			rtxp->rtx_timer -= elapsed;
1579 			if (rtxp->rtx_timer < next)
1580 				next = rtxp->rtx_timer;
1581 			continue;
1582 		}
1583 
1584 		rtxp->rtx_timer = INFINITY;
1585 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1586 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1587 			mutex_exit(&ill->ill_lock);
1588 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1589 			mutex_enter(&ill->ill_lock);
1590 			continue;
1591 		}
1592 
1593 		/*
1594 		 * The retransmit timer has popped, and our router is
1595 		 * MLDv2.  We have to delve into the retransmit state
1596 		 * stored in the ilm.
1597 		 *
1598 		 * Decrement the retransmit count.  If the fmode rtx
1599 		 * count is active, decrement it, and send a filter
1600 		 * mode change report with the ilm's source list.
1601 		 * Otherwise, send a source list change report with
1602 		 * the current retransmit lists.
1603 		 */
1604 		ASSERT(rtxp->rtx_cnt > 0);
1605 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1606 		rtxp->rtx_cnt--;
1607 		if (rtxp->rtx_fmode_cnt > 0) {
1608 			rtxp->rtx_fmode_cnt--;
1609 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1610 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1611 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1612 			    ilm->ilm_filter, rtxrp);
1613 		} else {
1614 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1615 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1616 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1617 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1618 		}
1619 		if (rtxp->rtx_cnt > 0) {
1620 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1621 			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1622 			if (rtxp->rtx_timer < next)
1623 				next = rtxp->rtx_timer;
1624 		} else {
1625 			CLEAR_SLIST(rtxp->rtx_allow);
1626 			CLEAR_SLIST(rtxp->rtx_block);
1627 		}
1628 	}
1629 
1630 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
1631 		mutex_exit(&ill->ill_lock);
1632 		mldv2_sendrpt(ill, rp);
1633 		mldv2_sendrpt(ill, rtxrp);
1634 		return (next);
1635 	}
1636 
1637 	mutex_exit(&ill->ill_lock);
1638 
1639 	return (next);
1640 }
1641 
1642 /*
1643  * mld_timeout_handler:
1644  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1645  * Returns number of ticks to next event (or 0 if none).
1646  * MT issues are same as igmp_timeout_handler
1647  */
1648 /* ARGSUSED */
1649 void
1650 mld_timeout_handler(void *arg)
1651 {
1652 	ill_t	*ill;
1653 	int	elapsed;	/* Since last call */
1654 	uint_t  global_next = INFINITY;
1655 	uint_t  next;
1656 	ill_walk_context_t ctx;
1657 	boolean_t success;
1658 
1659 	mutex_enter(&mld_timer_lock);
1660 	ASSERT(mld_timeout_id != 0);
1661 	mld_timer_fired_last = ddi_get_lbolt();
1662 	elapsed = mld_time_to_next;
1663 	mld_time_to_next = 0;
1664 	mutex_exit(&mld_timer_lock);
1665 
1666 	rw_enter(&ill_g_lock, RW_READER);
1667 	ill = ILL_START_WALK_V6(&ctx);
1668 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1669 		ASSERT(ill->ill_isv6);
1670 		/*
1671 		 * We may not be able to refhold the ill if the ill/ipif
1672 		 * is changing. But we need to make sure that the ill will
1673 		 * not vanish. So we just bump up the ill_waiter count.
1674 		 */
1675 		if (!ill_waiter_inc(ill))
1676 			continue;
1677 		rw_exit(&ill_g_lock);
1678 		success = ipsq_enter(ill, B_TRUE);
1679 		if (success) {
1680 			next = mld_timeout_handler_per_ill(ill, elapsed);
1681 			if (next < global_next)
1682 				global_next = next;
1683 			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE,
1684 			    B_FALSE);
1685 		}
1686 		rw_enter(&ill_g_lock, RW_READER);
1687 		ill_waiter_dcr(ill);
1688 	}
1689 	rw_exit(&ill_g_lock);
1690 
1691 	mutex_enter(&mld_timer_lock);
1692 	ASSERT(mld_timeout_id != 0);
1693 	mld_timeout_id = 0;
1694 	mutex_exit(&mld_timer_lock);
1695 
1696 	if (global_next != INFINITY)
1697 		mld_start_timers(global_next);
1698 }
1699 
/*
 * OVQP(ill): the Older Version Querier Present timeout for the given
 * ill, expressed as a count of slowtimo intervals.  It is computed as
 * 1000 * (ill_mcast_rv * ill_mcast_qi + MCAST_QUERY_RESP_INTERVAL),
 * divided by MCAST_SLOWTIMO_INTERVAL.
 */
#define	OVQP(ill)							\
	((1000 * (((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) +	\
	MCAST_QUERY_RESP_INTERVAL)) / MCAST_SLOWTIMO_INTERVAL)
1707 
1708 /*
1709  * igmp_slowtimo:
1710  * - Resets to new router if we didnt we hear from the router
1711  *   in IGMP_AGE_THRESHOLD seconds.
1712  * - Resets slowtimeout.
1713  */
1714 /* ARGSUSED */
1715 void
1716 igmp_slowtimo(void *arg)
1717 {
1718 	ill_t	*ill;
1719 	ill_if_t *ifp;
1720 	avl_tree_t *avl_tree;
1721 
1722 	/* Hold the ill_g_lock so that we can safely walk the ill list */
1723 	rw_enter(&ill_g_lock, RW_READER);
1724 
1725 	/*
1726 	 * The ill_if_t list is circular, hence the odd loop parameters.
1727 	 *
1728 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1729 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1730 	 * structure (allowing us to skip if none of the instances have timers
1731 	 * running).
1732 	 */
1733 	for (ifp = IP_V4_ILL_G_LIST; ifp != (ill_if_t *)&IP_V4_ILL_G_LIST;
1734 	    ifp = ifp->illif_next) {
1735 		/*
1736 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1737 		 * a V1 or V2 query now and we miss seeing the count now,
1738 		 * we will see it the next time igmp_slowtimo is called.
1739 		 */
1740 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1741 			continue;
1742 
1743 		avl_tree = &ifp->illif_avl_by_ppa;
1744 		for (ill = avl_first(avl_tree); ill != NULL;
1745 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1746 			mutex_enter(&ill->ill_lock);
1747 			if (ill->ill_mcast_v1_tset == 1)
1748 				ill->ill_mcast_v1_time++;
1749 			if (ill->ill_mcast_v2_tset == 1)
1750 				ill->ill_mcast_v2_time++;
1751 			if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1752 				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
1753 					if (ill->ill_mcast_v2_tset > 0) {
1754 						ip1dbg(("V1 query timer "
1755 						    "expired on %s; switching "
1756 						    "mode to IGMP_V2\n",
1757 						    ill->ill_name));
1758 						ill->ill_mcast_type =
1759 						    IGMP_V2_ROUTER;
1760 					} else {
1761 						ip1dbg(("V1 query timer "
1762 						    "expired on %s; switching "
1763 						    "mode to IGMP_V3\n",
1764 						    ill->ill_name));
1765 						ill->ill_mcast_type =
1766 						    IGMP_V3_ROUTER;
1767 					}
1768 					ill->ill_mcast_v1_time = 0;
1769 					ill->ill_mcast_v1_tset = 0;
1770 					atomic_add_16(&ifp->illif_mcast_v1, -1);
1771 				}
1772 			}
1773 			if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1774 				if (ill->ill_mcast_v2_time >= OVQP(ill)) {
1775 					ip1dbg(("V2 query timer expired on "
1776 					    "%s; switching mode to IGMP_V3\n",
1777 					    ill->ill_name));
1778 					ill->ill_mcast_type = IGMP_V3_ROUTER;
1779 					ill->ill_mcast_v2_time = 0;
1780 					ill->ill_mcast_v2_tset = 0;
1781 					atomic_add_16(&ifp->illif_mcast_v2, -1);
1782 				}
1783 			}
1784 			mutex_exit(&ill->ill_lock);
1785 		}
1786 
1787 	}
1788 	rw_exit(&ill_g_lock);
1789 	mutex_enter(&igmp_slowtimeout_lock);
1790 	igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL,
1791 		MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1792 	mutex_exit(&igmp_slowtimeout_lock);
1793 }
1794 
1795 /*
1796  * mld_slowtimo:
1797  * - Resets to newer version if we didn't hear from the older version router
1798  *   in MLD_AGE_THRESHOLD seconds.
1799  * - Restarts slowtimeout.
1800  */
1801 /* ARGSUSED */
1802 void
1803 mld_slowtimo(void *arg)
1804 {
1805 	ill_t *ill;
1806 	ill_if_t *ifp;
1807 	avl_tree_t *avl_tree;
1808 
1809 	/* See comments in igmp_slowtimo() above... */
1810 	rw_enter(&ill_g_lock, RW_READER);
1811 	for (ifp = IP_V6_ILL_G_LIST; ifp != (ill_if_t *)&IP_V6_ILL_G_LIST;
1812 	    ifp = ifp->illif_next) {
1813 
1814 		if (ifp->illif_mcast_v1 == 0)
1815 			continue;
1816 
1817 		avl_tree = &ifp->illif_avl_by_ppa;
1818 		for (ill = avl_first(avl_tree); ill != NULL;
1819 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1820 			mutex_enter(&ill->ill_lock);
1821 			if (ill->ill_mcast_v1_tset == 1)
1822 				ill->ill_mcast_v1_time++;
1823 			if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1824 				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
1825 					ip1dbg(("MLD query timer expired on"
1826 					    " %s; switching mode to MLD_V2\n",
1827 					    ill->ill_name));
1828 					ill->ill_mcast_type = MLD_V2_ROUTER;
1829 					ill->ill_mcast_v1_time = 0;
1830 					ill->ill_mcast_v1_tset = 0;
1831 					atomic_add_16(&ifp->illif_mcast_v1, -1);
1832 				}
1833 			}
1834 			mutex_exit(&ill->ill_lock);
1835 		}
1836 	}
1837 	rw_exit(&ill_g_lock);
1838 	mutex_enter(&mld_slowtimeout_lock);
1839 	mld_slowtimeout_id = timeout(mld_slowtimo, NULL,
1840 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1841 	mutex_exit(&mld_slowtimeout_lock);
1842 }
1843 
1844 /*
1845  * igmp_sendpkt:
1846  * This will send to ip_wput like icmp_inbound.
1847  * Note that the lower ill (on which the membership is kept) is used
1848  * as an upper ill to pass in the multicast parameters.
1849  */
1850 static void
1851 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1852 {
1853 	mblk_t	*mp;
1854 	igmpa_t	*igmpa;
1855 	uint8_t *rtralert;
1856 	ipha_t	*ipha;
1857 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1858 	size_t	size  = hdrlen + sizeof (igmpa_t);
1859 	ipif_t 	*ipif = ilm->ilm_ipif;
1860 	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
1861 	mblk_t	*first_mp;
1862 	ipsec_out_t *io;
1863 	zoneid_t zoneid;
1864 
1865 	/*
1866 	 * We need to make sure this packet goes out on an ipif. If
1867 	 * there is some global policy match in ip_wput_ire, we need
1868 	 * to get to the right interface after IPSEC processing.
1869 	 * To make sure this multicast packet goes out on the right
1870 	 * interface, we attach an ipsec_out and initialize ill_index
1871 	 * like we did in ip_wput. To make sure that this packet does
1872 	 * not get forwarded on other interfaces or looped back, we
1873 	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
1874 	 * to B_FALSE.
1875 	 *
1876 	 * We also need to make sure that this does not get load balanced
1877 	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
1878 	 * here. If it gets load balanced, switches supporting igmp snooping
1879 	 * will send the packet that it receives for this multicast group
1880 	 * to the interface that we are sending on. As we have joined the
1881 	 * multicast group on this ill, by sending the packet out on this
1882 	 * ill, we receive all the packets back on this ill.
1883 	 */
1884 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
1885 	if (first_mp == NULL)
1886 		return;
1887 
1888 	first_mp->b_datap->db_type = M_CTL;
1889 	first_mp->b_wptr += sizeof (ipsec_info_t);
1890 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
1891 	/* ipsec_out_secure is B_FALSE now */
1892 	io = (ipsec_out_t *)first_mp->b_rptr;
1893 	io->ipsec_out_type = IPSEC_OUT;
1894 	io->ipsec_out_len = sizeof (ipsec_out_t);
1895 	io->ipsec_out_use_global_policy = B_TRUE;
1896 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
1897 	io->ipsec_out_attach_if = B_TRUE;
1898 	io->ipsec_out_multicast_loop = B_FALSE;
1899 	io->ipsec_out_dontroute = B_TRUE;
1900 	if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
1901 		zoneid = GLOBAL_ZONEID;
1902 	io->ipsec_out_zoneid = zoneid;
1903 
1904 	mp = allocb(size, BPRI_HI);
1905 	if (mp == NULL) {
1906 		freemsg(first_mp);
1907 		return;
1908 	}
1909 	mp->b_wptr = mp->b_rptr + size;
1910 	first_mp->b_cont = mp;
1911 
1912 	ipha = (ipha_t *)mp->b_rptr;
1913 	rtralert = (uint8_t *)&(ipha[1]);
1914 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1915 	igmpa->igmpa_type   = type;
1916 	igmpa->igmpa_code   = 0;
1917 	igmpa->igmpa_group  = ilm->ilm_addr;
1918 	igmpa->igmpa_cksum  = 0;
1919 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1920 	if (igmpa->igmpa_cksum == 0)
1921 		igmpa->igmpa_cksum = 0xffff;
1922 
1923 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1924 	rtralert[1] = RTRALERT_LEN;
1925 	rtralert[2] = 0;
1926 	rtralert[3] = 0;
1927 
1928 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1929 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1930 	ipha->ipha_type_of_service 	= 0;
1931 	ipha->ipha_length = htons(size);
1932 	ipha->ipha_ident = 0;
1933 	ipha->ipha_fragment_offset_and_flags = 0;
1934 	ipha->ipha_ttl 		= IGMP_TTL;
1935 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1936 	ipha->ipha_hdr_checksum 	= 0;
1937 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1938 	ipha->ipha_src 		= ipif->ipif_src_addr;
1939 	/*
1940 	 * Request loopback of the report if we are acting as a multicast
1941 	 * router, so that the process-level routing demon can hear it.
1942 	 */
1943 	/*
1944 	 * This will run multiple times for the same group if there are members
1945 	 * on the same group for multiple ipif's on the same ill. The
1946 	 * igmp_input code will suppress this due to the loopback thus we
1947 	 * always loopback membership report.
1948 	 */
1949 	ASSERT(ill->ill_rq != NULL);
1950 	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);
1951 
1952 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
1953 
1954 	++igmpstat.igps_snd_reports;
1955 }
1956 
1957 /*
1958  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
1959  * with the passed-in ipif.  The report will contain one group record
1960  * for each element of reclist.  If this causes packet length to
1961  * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
1962  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1963  * and those buffers are freed here.
1964  */
1965 static void
1966 igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
1967 {
1968 	ipsec_out_t *io;
1969 	igmp3ra_t *igmp3ra;
1970 	grphdra_t *grphdr;
1971 	mblk_t *first_mp, *mp;
1972 	ipha_t *ipha;
1973 	uint8_t *rtralert;
1974 	ipaddr_t *src_array;
1975 	int i, j, numrec, more_src_cnt;
1976 	size_t hdrsize, size, rsize;
1977 	ill_t *ill = ipif->ipif_ill;
1978 	mrec_t *rp, *cur_reclist;
1979 	mrec_t *next_reclist = reclist;
1980 	boolean_t morepkts;
1981 	zoneid_t zoneid;
1982 
1983 	/* if there aren't any records, there's nothing to send */
1984 	if (reclist == NULL)
1985 		return;
1986 
1987 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
1988 nextpkt:
1989 	size = hdrsize + sizeof (igmp3ra_t);
1990 	morepkts = B_FALSE;
1991 	more_src_cnt = 0;
1992 	cur_reclist = next_reclist;
1993 	numrec = 0;
1994 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
1995 		rsize = sizeof (grphdra_t) +
1996 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
1997 		if (size + rsize > ill->ill_max_frag) {
1998 			if (rp == cur_reclist) {
1999 				/*
2000 				 * If the first mrec we looked at is too big
2001 				 * to fit in a single packet (i.e the source
2002 				 * list is too big), we must either truncate
2003 				 * the list (if TO_EX or IS_EX), or send
2004 				 * multiple reports for the same group (all
2005 				 * other types).
2006 				 */
2007 				int srcspace, srcsperpkt;
2008 				srcspace = ill->ill_max_frag - (size +
2009 				    sizeof (grphdra_t));
2010 				srcsperpkt = srcspace / sizeof (ipaddr_t);
2011 				/*
2012 				 * Increment size and numrec, because we will
2013 				 * be sending a record for the mrec we're
2014 				 * looking at now.
2015 				 */
2016 				size += sizeof (grphdra_t) +
2017 				    (srcsperpkt * sizeof (ipaddr_t));
2018 				numrec++;
2019 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2020 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2021 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2022 					if (rp->mrec_next == NULL) {
2023 						/* no more packets to send */
2024 						break;
2025 					} else {
2026 						/*
2027 						 * more packets, but we're
2028 						 * done with this mrec.
2029 						 */
2030 						next_reclist = rp->mrec_next;
2031 					}
2032 				} else {
2033 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2034 					    - srcsperpkt;
2035 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2036 					/*
2037 					 * We'll fix up this mrec (remove the
2038 					 * srcs we've already sent) before
2039 					 * returning to nextpkt above.
2040 					 */
2041 					next_reclist = rp;
2042 				}
2043 			} else {
2044 				next_reclist = rp;
2045 			}
2046 			morepkts = B_TRUE;
2047 			break;
2048 		}
2049 		size += rsize;
2050 		numrec++;
2051 	}
2052 
2053 	/*
2054 	 * See comments in igmp_sendpkt() about initializing for ipsec and
2055 	 * load balancing requirements.
2056 	 */
2057 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
2058 	if (first_mp == NULL)
2059 		goto free_reclist;
2060 
2061 	first_mp->b_datap->db_type = M_CTL;
2062 	first_mp->b_wptr += sizeof (ipsec_info_t);
2063 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
2064 	/* ipsec_out_secure is B_FALSE now */
2065 	io = (ipsec_out_t *)first_mp->b_rptr;
2066 	io->ipsec_out_type = IPSEC_OUT;
2067 	io->ipsec_out_len = sizeof (ipsec_out_t);
2068 	io->ipsec_out_use_global_policy = B_TRUE;
2069 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
2070 	io->ipsec_out_attach_if = B_TRUE;
2071 	io->ipsec_out_multicast_loop = B_FALSE;
2072 	io->ipsec_out_dontroute = B_TRUE;
2073 	if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
2074 		zoneid = GLOBAL_ZONEID;
2075 	io->ipsec_out_zoneid = zoneid;
2076 
2077 	mp = allocb(size, BPRI_HI);
2078 	if (mp == NULL) {
2079 		freemsg(first_mp);
2080 		goto free_reclist;
2081 	}
2082 	bzero((char *)mp->b_rptr, size);
2083 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
2084 	first_mp->b_cont = mp;
2085 
2086 	ipha = (ipha_t *)mp->b_rptr;
2087 	rtralert = (uint8_t *)&(ipha[1]);
2088 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
2089 	grphdr = (grphdra_t *)&(igmp3ra[1]);
2090 
2091 	rp = cur_reclist;
2092 	for (i = 0; i < numrec; i++) {
2093 		grphdr->grphdra_type = rp->mrec_type;
2094 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2095 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
2096 		src_array = (ipaddr_t *)&(grphdr[1]);
2097 
2098 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
2099 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
2100 
2101 		grphdr = (grphdra_t *)&(src_array[j]);
2102 		rp = rp->mrec_next;
2103 	}
2104 
2105 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
2106 	igmp3ra->igmp3ra_numrec = htons(numrec);
2107 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
2108 
2109 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
2110 	rtralert[1] = RTRALERT_LEN;
2111 	rtralert[2] = 0;
2112 	rtralert[3] = 0;
2113 
2114 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2115 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2116 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2117 	ipha->ipha_length = htons(size);
2118 	ipha->ipha_ttl = IGMP_TTL;
2119 	ipha->ipha_protocol = IPPROTO_IGMP;
2120 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2121 	ipha->ipha_src = ipif->ipif_src_addr;
2122 
2123 	/*
2124 	 * Request loopback of the report if we are acting as a multicast
2125 	 * router, so that the process-level routing daemon can hear it.
2126 	 *
2127 	 * This will run multiple times for the same group if there are
2128 	 * members on the same group for multiple ipifs on the same ill.
2129 	 * The igmp_input code will suppress this due to the loopback;
2130 	 * thus we always loopback membership report.
2131 	 */
2132 	ASSERT(ill->ill_rq != NULL);
2133 	ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);
2134 
2135 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
2136 
2137 	++igmpstat.igps_snd_reports;
2138 
2139 	if (morepkts) {
2140 		if (more_src_cnt > 0) {
2141 			int index, mvsize;
2142 			slist_t *sl = &next_reclist->mrec_srcs;
2143 			index = sl->sl_numsrc;
2144 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2145 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2146 			    mvsize);
2147 			sl->sl_numsrc = more_src_cnt;
2148 		}
2149 		goto nextpkt;
2150 	}
2151 
2152 free_reclist:
2153 	while (reclist != NULL) {
2154 		rp = reclist->mrec_next;
2155 		mi_free(reclist);
2156 		reclist = rp;
2157 	}
2158 }
2159 
2160 /*
2161  * mld_input:
2162  */
2163 /* ARGSUSED */
2164 void
2165 mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
2166 {
2167 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2168 	mld_hdr_t	*mldh;
2169 	ilm_t		*ilm;
2170 	ipif_t		*ipif;
2171 	uint16_t	hdr_length, exthdr_length;
2172 	in6_addr_t	*v6group_ptr, *lcladdr_ptr;
2173 	uint_t		next;
2174 	int		mldlen;
2175 
2176 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2177 
2178 	/* Make sure the src address of the packet is link-local */
2179 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2180 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2181 		freemsg(mp);
2182 		return;
2183 	}
2184 
2185 	if (ip6h->ip6_hlim != 1) {
2186 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2187 		freemsg(mp);
2188 		return;
2189 	}
2190 
2191 	/* Get to the icmp header part */
2192 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2193 		hdr_length = ip_hdr_length_v6(mp, ip6h);
2194 		exthdr_length = hdr_length - IPV6_HDR_LEN;
2195 	} else {
2196 		hdr_length = IPV6_HDR_LEN;
2197 		exthdr_length = 0;
2198 	}
2199 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2200 
2201 	/* An MLD packet must at least be 24 octets to be valid */
2202 	if (mldlen < MLD_MINLEN) {
2203 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2204 		freemsg(mp);
2205 		return;
2206 	}
2207 
2208 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2209 
2210 	switch (mldh->mld_type) {
2211 	case MLD_LISTENER_QUERY:
2212 		/*
2213 		 * packet length differentiates between v1 and v2.  v1
2214 		 * query should be exactly 24 octets long; v2 is >= 28.
2215 		 */
2216 		if (mldlen == MLD_MINLEN) {
2217 			next = mld_query_in(mldh, ill);
2218 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2219 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2220 		} else {
2221 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2222 			freemsg(mp);
2223 			return;
2224 		}
2225 		if (next == 0) {
2226 			freemsg(mp);
2227 			return;
2228 		}
2229 
2230 		if (next != INFINITY)
2231 			mld_start_timers(next);
2232 		break;
2233 
2234 	case MLD_LISTENER_REPORT: {
2235 
2236 		ASSERT(ill->ill_ipif != NULL);
2237 		/*
2238 		 * For fast leave to work, we have to know that we are the
2239 		 * last person to send a report for this group.  Reports
2240 		 * generated by us are looped back since we could potentially
2241 		 * be a multicast router, so discard reports sourced by me.
2242 		 */
2243 		lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
2244 		mutex_enter(&ill->ill_lock);
2245 		for (ipif = ill->ill_ipif; ipif != NULL;
2246 		    ipif = ipif->ipif_next) {
2247 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2248 			    lcladdr_ptr)) {
2249 				if (ip_debug > 1) {
2250 					char    buf1[INET6_ADDRSTRLEN];
2251 					char	buf2[INET6_ADDRSTRLEN];
2252 
2253 					(void) mi_strlog(ill->ill_rq,
2254 					    1,
2255 					    SL_TRACE,
2256 					    "mld_input: we are only "
2257 					    "member src %s ipif_local %s",
2258 					    inet_ntop(AF_INET6, lcladdr_ptr,
2259 					    buf1, sizeof (buf1)),
2260 					    inet_ntop(AF_INET6,
2261 					    &ipif->ipif_v6lcl_addr,
2262 					    buf2, sizeof (buf2)));
2263 				}
2264 				mutex_exit(&ill->ill_lock);
2265 				freemsg(mp);
2266 				return;
2267 			}
2268 		}
2269 		mutex_exit(&ill->ill_lock);
2270 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2271 
2272 		v6group_ptr = &mldh->mld_addr;
2273 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2274 			BUMP_MIB(ill->ill_icmp6_mib,
2275 			    ipv6IfIcmpInGroupMembBadReports);
2276 			freemsg(mp);
2277 			return;
2278 		}
2279 
2280 
2281 		/*
2282 		 * If we belong to the group being reported, and we are a
2283 		 * 'Delaying member' per the RFC terminology, stop our timer
2284 		 * for that group and 'clear flag' i.e. mark ilm_state as
2285 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2286 		 * membership entries for the same group address (one per zone)
2287 		 * so we need to walk the ill_ilm list.
2288 		 */
2289 		mutex_enter(&ill->ill_lock);
2290 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2291 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2292 			    continue;
2293 			BUMP_MIB(ill->ill_icmp6_mib,
2294 			    ipv6IfIcmpInGroupMembOurReports);
2295 
2296 			ilm->ilm_timer = INFINITY;
2297 			ilm->ilm_state = IGMP_OTHERMEMBER;
2298 		}
2299 		mutex_exit(&ill->ill_lock);
2300 		break;
2301 	}
2302 	case MLD_LISTENER_REDUCTION:
2303 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2304 		break;
2305 	}
2306 	/*
2307 	 * All MLD packets have already been passed up to any
2308 	 * process(es) listening on a ICMP6 raw socket. This
2309 	 * has been accomplished in ip_deliver_local_v6 prior to
2310 	 * this function call. It is assumed that the multicast daemon
2311 	 * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the
2312 	 * ICMP6_FILTER socket option to only receive the MLD messages)
2313 	 * Thus we can free the MLD message block here
2314 	 */
2315 	freemsg(mp);
2316 }
2317 
2318 /*
2319  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2320  * (non-zero, unsigned) timer value to be set on success.
2321  */
2322 static uint_t
2323 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2324 {
2325 	ilm_t	*ilm;
2326 	int	timer;
2327 	uint_t	next;
2328 	in6_addr_t *v6group;
2329 
2330 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2331 
2332 	/*
2333 	 * In the MLD specification, there are 3 states and a flag.
2334 	 *
2335 	 * In Non-Listener state, we simply don't have a membership record.
2336 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2337 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2338 	 * INFINITY)
2339 	 *
2340 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2341 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2342 	 * if I sent the last report.
2343 	 */
2344 	v6group = &mldh->mld_addr;
2345 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2346 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2347 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2348 		return (0);
2349 	}
2350 
2351 	/* Need to do compatibility mode checking */
2352 	mutex_enter(&ill->ill_lock);
2353 	ill->ill_mcast_v1_time = 0;
2354 	ill->ill_mcast_v1_tset = 1;
2355 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2356 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2357 		    "MLD_V1_ROUTER\n", ill->ill_name));
2358 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2359 		ill->ill_mcast_type = MLD_V1_ROUTER;
2360 	}
2361 	mutex_exit(&ill->ill_lock);
2362 
2363 	timer = (int)ntohs(mldh->mld_maxdelay);
2364 	if (ip_debug > 1) {
2365 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2366 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2367 		    timer, (int)mldh->mld_type);
2368 	}
2369 
2370 	/*
2371 	 * -Start the timers in all of our membership records for
2372 	 * the physical interface on which the query arrived,
2373 	 * excl:
2374 	 *	1.  those that belong to the "all hosts" group,
2375 	 *	2.  those with 0 scope, or 1 node-local scope.
2376 	 *
2377 	 * -Restart any timer that is already running but has a value
2378 	 * longer that the requested timeout.
2379 	 * -Use the value specified in the query message as the
2380 	 * maximum timeout.
2381 	 */
2382 	next = INFINITY;
2383 	mutex_enter(&ill->ill_lock);
2384 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2385 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2386 
2387 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2388 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2389 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2390 			continue;
2391 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2392 		    &ipv6_all_hosts_mcast)) &&
2393 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2394 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2395 			if (timer == 0) {
2396 				/* Respond immediately */
2397 				ilm->ilm_timer = INFINITY;
2398 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2399 				mutex_exit(&ill->ill_lock);
2400 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2401 				mutex_enter(&ill->ill_lock);
2402 				break;
2403 			}
2404 			if (ilm->ilm_timer > timer) {
2405 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2406 				if (ilm->ilm_timer < next)
2407 					next = ilm->ilm_timer;
2408 			}
2409 			break;
2410 		}
2411 	}
2412 	mutex_exit(&ill->ill_lock);
2413 
2414 	return (next);
2415 }
2416 
2417 /*
2418  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2419  * returns the appropriate (non-zero, unsigned) timer value (which may
2420  * be INFINITY) to be set.
2421  */
2422 static uint_t
2423 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2424 {
2425 	ilm_t	*ilm;
2426 	in6_addr_t *v6group, *src_array;
2427 	uint_t	next, numsrc, i, mrd, delay, qqi;
2428 	uint8_t	qrv;
2429 
2430 	v6group = &mld2q->mld2q_addr;
2431 	numsrc = ntohs(mld2q->mld2q_numsrc);
2432 
2433 	/* make sure numsrc matches packet size */
2434 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2435 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2436 		return (0);
2437 	}
2438 	src_array = (in6_addr_t *)&mld2q[1];
2439 
2440 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2441 
2442 	/* extract Maximum Response Delay from code in header */
2443 	mrd = ntohs(mld2q->mld2q_mxrc);
2444 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2445 		uint_t hdrval, mant, exp;
2446 		hdrval = mrd;
2447 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2448 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2449 		mrd = (mant | 0x1000) << (exp + 3);
2450 	}
2451 	MCAST_RANDOM_DELAY(delay, mrd);
2452 	next = (unsigned)INFINITY;
2453 
2454 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2455 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2456 	else
2457 		ill->ill_mcast_rv = qrv;
2458 
2459 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2460 		uint_t mant, exp;
2461 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2462 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2463 		qqi = (mant | 0x10) << (exp + 3);
2464 	}
2465 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2466 
2467 	/*
2468 	 * If we have a pending general query response that's scheduled
2469 	 * sooner than the delay we calculated for this response, then
2470 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2471 	 */
2472 	mutex_enter(&ill->ill_lock);
2473 	if (ill->ill_global_timer < delay) {
2474 		mutex_exit(&ill->ill_lock);
2475 		return (next);
2476 	}
2477 	mutex_exit(&ill->ill_lock);
2478 
2479 	/*
2480 	 * Now take action depending on query type: general,
2481 	 * group specific, or group/source specific.
2482 	 */
2483 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2484 		/*
2485 		 * general query
2486 		 * We know global timer is either not running or is
2487 		 * greater than our calculated delay, so reset it to
2488 		 * our delay (random value in range [0, response time])
2489 		 */
2490 		mutex_enter(&ill->ill_lock);
2491 		ill->ill_global_timer = delay;
2492 		next = ill->ill_global_timer;
2493 		mutex_exit(&ill->ill_lock);
2494 
2495 	} else {
2496 		/* group or group/source specific query */
2497 		mutex_enter(&ill->ill_lock);
2498 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2499 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2500 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2501 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2502 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2503 				continue;
2504 
2505 			/*
2506 			 * If the query is group specific or we have a
2507 			 * pending group specific query, the response is
2508 			 * group specific (pending sources list should be
2509 			 * empty).  Otherwise, need to update the pending
2510 			 * sources list for the group and source specific
2511 			 * response.
2512 			 */
2513 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2514 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2515 group_query:
2516 				FREE_SLIST(ilm->ilm_pendsrcs);
2517 				ilm->ilm_pendsrcs = NULL;
2518 			} else {
2519 				boolean_t overflow;
2520 				slist_t *pktl;
2521 				if (numsrc > MAX_FILTER_SIZE ||
2522 				    (ilm->ilm_pendsrcs == NULL &&
2523 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2524 					/*
2525 					 * We've been sent more sources than
2526 					 * we can deal with; or we can't deal
2527 					 * with a source list at all. Revert
2528 					 * to a group specific query.
2529 					 */
2530 					goto group_query;
2531 				}
2532 				if ((pktl = l_alloc()) == NULL)
2533 					goto group_query;
2534 				pktl->sl_numsrc = numsrc;
2535 				for (i = 0; i < numsrc; i++)
2536 					pktl->sl_addr[i] = src_array[i];
2537 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2538 				    &overflow);
2539 				l_free(pktl);
2540 				if (overflow)
2541 					goto group_query;
2542 			}
2543 			/* set timer to soonest value */
2544 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2545 			if (ilm->ilm_timer < next)
2546 				next = ilm->ilm_timer;
2547 			break;
2548 		}
2549 		mutex_exit(&ill->ill_lock);
2550 	}
2551 
2552 	return (next);
2553 }
2554 
2555 /*
2556  * Send MLDv1 response packet with hoplimit 1
2557  */
2558 static void
2559 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2560 {
2561 	mblk_t		*mp;
2562 	mld_hdr_t	*mldh;
2563 	ip6_t 		*ip6h;
2564 	ip6_hbh_t	*ip6hbh;
2565 	struct ip6_opt_router	*ip6router;
2566 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2567 	ill_t		*ill = ilm->ilm_ill;   /* Will be the "lower" ill */
2568 	ipif_t		*ipif;
2569 	ip6i_t		*ip6i;
2570 
2571 	/*
2572 	 * We need to place a router alert option in this packet.  The length
2573 	 * of the options must be a multiple of 8.  The hbh option header is 2
2574 	 * bytes followed by the 4 byte router alert option.  That leaves
2575 	 * 2 bytes of pad for a total of 8 bytes.
2576 	 */
2577 	const int	router_alert_length = 8;
2578 
2579 	ASSERT(ill->ill_isv6);
2580 
2581 	/*
2582 	 * We need to make sure that this packet does not get load balanced.
2583 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2584 	 * ip_newroute_ipif_v6 knows how to handle such packets.
2585 	 * If it gets load balanced, switches supporting MLD snooping
2586 	 * (in the future) will send the packet that it receives for this
2587 	 * multicast group to the interface that we are sending on. As we have
2588 	 * joined the multicast group on this ill, by sending the packet out
2589 	 * on this ill, we receive all the packets back on this ill.
2590 	 */
2591 	size += sizeof (ip6i_t) + router_alert_length;
2592 	mp = allocb(size, BPRI_HI);
2593 	if (mp == NULL)
2594 		return;
2595 	bzero(mp->b_rptr, size);
2596 	mp->b_wptr = mp->b_rptr + size;
2597 
2598 	ip6i = (ip6i_t *)mp->b_rptr;
2599 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2600 	ip6i->ip6i_nxt = IPPROTO_RAW;
2601 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2602 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2603 
2604 	ip6h = (ip6_t *)&ip6i[1];
2605 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2606 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2607 	/*
2608 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2609 	 * above will pad between ip6router and mld.
2610 	 */
2611 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2612 
2613 	mldh->mld_type = type;
2614 	mldh->mld_addr = ilm->ilm_v6addr;
2615 
2616 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2617 	ip6router->ip6or_len = 2;
2618 	ip6router->ip6or_value[0] = 0;
2619 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2620 
2621 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2622 	ip6hbh->ip6h_len = 0;
2623 
2624 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2625 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2626 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2627 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2628 	if (v6addr == NULL)
2629 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2630 	else
2631 		ip6h->ip6_dst = *v6addr;
2632 
2633 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2634 	if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
2635 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2636 		ipif_refrele(ipif);
2637 	} else {
2638 		/* Otherwise, use IPv6 default address selection. */
2639 		ip6h->ip6_src = ipv6_all_zeros;
2640 	}
2641 
2642 	/*
2643 	 * Prepare for checksum by putting icmp length in the icmp
2644 	 * checksum field. The checksum is calculated in ip_wput_v6.
2645 	 */
2646 	mldh->mld_cksum = htons(sizeof (*mldh));
2647 
2648 	/*
2649 	 * ip_wput will automatically loopback the multicast packet to
2650 	 * the conn if multicast loopback is enabled.
2651 	 * The MIB stats corresponding to this outgoing MLD packet
2652 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2653 	 * ->icmp_update_out_mib_v6 function call.
2654 	 */
2655 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2656 }
2657 
2658 /*
2659  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2660  * report will contain one multicast address record for each element of
2661  * reclist.  If this causes packet length to exceed ill->ill_max_frag,
2662  * multiple reports are sent.  reclist is assumed to be made up of
2663  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2664  */
2665 static void
2666 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2667 {
2668 	mblk_t		*mp;
2669 	mld2r_t		*mld2r;
2670 	mld2mar_t	*mld2mar;
2671 	in6_addr_t	*srcarray;
2672 	ip6_t		*ip6h;
2673 	ip6_hbh_t	*ip6hbh;
2674 	ip6i_t		*ip6i;
2675 	struct ip6_opt_router	*ip6router;
2676 	size_t		size, optlen, padlen, icmpsize, rsize;
2677 	ipif_t		*ipif;
2678 	int		i, numrec, more_src_cnt;
2679 	mrec_t		*rp, *cur_reclist;
2680 	mrec_t		*next_reclist = reclist;
2681 	boolean_t	morepkts;
2682 
2683 	/* If there aren't any records, there's nothing to send */
2684 	if (reclist == NULL)
2685 		return;
2686 
2687 	ASSERT(ill->ill_isv6);
2688 
2689 	/*
2690 	 * Total option length (optlen + padlen) must be a multiple of
2691 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2692 	 * length will be 8.  Assert this in case anything ever changes.
2693 	 */
2694 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2695 	ASSERT(optlen <= 8);
2696 	padlen = 8 - optlen;
2697 nextpkt:
2698 	icmpsize = sizeof (mld2r_t);
2699 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2700 	morepkts = B_FALSE;
2701 	more_src_cnt = 0;
2702 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2703 	    rp = rp->mrec_next, numrec++) {
2704 		rsize = sizeof (mld2mar_t) +
2705 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2706 		if (size + rsize > ill->ill_max_frag) {
2707 			if (rp == cur_reclist) {
2708 				/*
2709 				 * If the first mrec we looked at is too big
2710 				 * to fit in a single packet (i.e the source
2711 				 * list is too big), we must either truncate
2712 				 * the list (if TO_EX or IS_EX), or send
2713 				 * multiple reports for the same group (all
2714 				 * other types).
2715 				 */
2716 				int srcspace, srcsperpkt;
2717 				srcspace = ill->ill_max_frag -
2718 				    (size + sizeof (mld2mar_t));
2719 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2720 				/*
2721 				 * Increment icmpsize and size, because we will
2722 				 * be sending a record for the mrec we're
2723 				 * looking at now.
2724 				 */
2725 				rsize = sizeof (mld2mar_t) +
2726 				    (srcsperpkt * sizeof (in6_addr_t));
2727 				icmpsize += rsize;
2728 				size += rsize;
2729 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2730 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2731 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2732 					if (rp->mrec_next == NULL) {
2733 						/* no more packets to send */
2734 						break;
2735 					} else {
2736 						/*
2737 						 * more packets, but we're
2738 						 * done with this mrec.
2739 						 */
2740 						next_reclist = rp->mrec_next;
2741 					}
2742 				} else {
2743 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2744 					    - srcsperpkt;
2745 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2746 					/*
2747 					 * We'll fix up this mrec (remove the
2748 					 * srcs we've already sent) before
2749 					 * returning to nextpkt above.
2750 					 */
2751 					next_reclist = rp;
2752 				}
2753 			} else {
2754 				next_reclist = rp;
2755 			}
2756 			morepkts = B_TRUE;
2757 			break;
2758 		}
2759 		icmpsize += rsize;
2760 		size += rsize;
2761 	}
2762 
2763 	/*
2764 	 * We need to make sure that this packet does not get load balanced.
2765 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2766 	 * ip_newroute_ipif_v6 know how to handle such packets.
2767 	 * If it gets load balanced, switches supporting MLD snooping
2768 	 * (in the future) will send the packet that it receives for this
2769 	 * multicast group to the interface that we are sending on. As we have
2770 	 * joined the multicast group on this ill, by sending the packet out
2771 	 * on this ill, we receive all the packets back on this ill.
2772 	 */
2773 	size += sizeof (ip6i_t);
2774 	mp = allocb(size, BPRI_HI);
2775 	if (mp == NULL)
2776 		goto free_reclist;
2777 	bzero(mp->b_rptr, size);
2778 	mp->b_wptr = mp->b_rptr + size;
2779 
2780 	ip6i = (ip6i_t *)mp->b_rptr;
2781 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2782 	ip6i->ip6i_nxt = IPPROTO_RAW;
2783 	ip6i->ip6i_flags = IP6I_ATTACH_IF;
2784 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2785 
2786 	ip6h = (ip6_t *)&(ip6i[1]);
2787 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2788 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2789 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2790 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2791 
2792 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2793 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2794 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2795 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2796 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2797 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2798 	if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
2799 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2800 		ipif_refrele(ipif);
2801 	} else {
2802 		/* otherwise, use IPv6 default address selection. */
2803 		ip6h->ip6_src = ipv6_all_zeros;
2804 	}
2805 
2806 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2807 	/*
2808 	 * ip6h_len is the number of 8-byte words, not including the first
2809 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2810 	 */
2811 	ip6hbh->ip6h_len = 0;
2812 
2813 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2814 	ip6router->ip6or_len = 2;
2815 	ip6router->ip6or_value[0] = 0;
2816 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2817 
2818 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2819 	mld2r->mld2r_nummar = htons(numrec);
2820 	/*
2821 	 * Prepare for the checksum by putting icmp length in the icmp
2822 	 * checksum field. The checksum is calculated in ip_wput_v6.
2823 	 */
2824 	mld2r->mld2r_cksum = htons(icmpsize);
2825 
2826 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2827 		mld2mar->mld2mar_type = rp->mrec_type;
2828 		mld2mar->mld2mar_auxlen = 0;
2829 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2830 		mld2mar->mld2mar_group = rp->mrec_group;
2831 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2832 
2833 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2834 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2835 
2836 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2837 	}
2838 
2839 	/*
2840 	 * ip_wput will automatically loopback the multicast packet to
2841 	 * the conn if multicast loopback is enabled.
2842 	 * The MIB stats corresponding to this outgoing MLD packet
2843 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2844 	 * ->icmp_update_out_mib_v6 function call.
2845 	 */
2846 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2847 
2848 	if (morepkts) {
2849 		if (more_src_cnt > 0) {
2850 			int index, mvsize;
2851 			slist_t *sl = &next_reclist->mrec_srcs;
2852 			index = sl->sl_numsrc;
2853 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2854 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2855 			    mvsize);
2856 			sl->sl_numsrc = more_src_cnt;
2857 		}
2858 		goto nextpkt;
2859 	}
2860 
2861 free_reclist:
2862 	while (reclist != NULL) {
2863 		rp = reclist->mrec_next;
2864 		mi_free(reclist);
2865 		reclist = rp;
2866 	}
2867 }
2868 
2869 static mrec_t *
2870 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2871     mrec_t *next)
2872 {
2873 	mrec_t *rp;
2874 	int i;
2875 
2876 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2877 	    SLIST_IS_EMPTY(srclist))
2878 		return (next);
2879 
2880 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2881 	if (rp == NULL)
2882 		return (next);
2883 
2884 	rp->mrec_next = next;
2885 	rp->mrec_type = type;
2886 	rp->mrec_auxlen = 0;
2887 	rp->mrec_group = *grp;
2888 	if (srclist == NULL) {
2889 		rp->mrec_srcs.sl_numsrc = 0;
2890 	} else {
2891 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2892 		for (i = 0; i < srclist->sl_numsrc; i++)
2893 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2894 	}
2895 
2896 	return (rp);
2897 }
2898 
2899 /*
2900  * Set up initial retransmit state.  If memory cannot be allocated for
2901  * the source lists, simply create as much state as is possible; memory
2902  * allocation failures are considered one type of transient error that
2903  * the retransmissions are designed to overcome (and if they aren't
2904  * transient, there are bigger problems than failing to notify the
2905  * router about multicast group membership state changes).
2906  */
2907 static void
2908 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2909     slist_t *flist)
2910 {
2911 	/*
2912 	 * There are only three possibilities for rtype:
2913 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2914 	 *	  => rtype is ALLOW_NEW_SOURCES
2915 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2916 	 *	  => rtype is CHANGE_TO_EXCLUDE
2917 	 *	State change that involves a filter mode change
2918 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2919 	 */
2920 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2921 	    rtype == ALLOW_NEW_SOURCES);
2922 
2923 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2924 
2925 	switch (rtype) {
2926 	case CHANGE_TO_EXCLUDE:
2927 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2928 		CLEAR_SLIST(rtxp->rtx_allow);
2929 		COPY_SLIST(flist, rtxp->rtx_block);
2930 		break;
2931 	case ALLOW_NEW_SOURCES:
2932 	case CHANGE_TO_INCLUDE:
2933 		rtxp->rtx_fmode_cnt =
2934 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2935 		CLEAR_SLIST(rtxp->rtx_block);
2936 		COPY_SLIST(flist, rtxp->rtx_allow);
2937 		break;
2938 	}
2939 }
2940 
2941 /*
2942  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2943  * RFC 3376 section 5.1, covers three cases:
2944  *	* The current state change is a filter mode change
2945  *		Set filter mode retransmit counter; set retransmit allow or
2946  *		block list to new source list as appropriate, and clear the
2947  *		retransmit list that was not set; send TO_IN or TO_EX with
2948  *		new source list.
2949  *	* The current state change is a source list change, but the filter
2950  *	  mode retransmit counter is > 0
2951  *		Decrement filter mode retransmit counter; set retransmit
2952  *		allow or block list to  new source list as appropriate,
2953  *		and clear the retransmit list that was not set; send TO_IN
2954  *		or TO_EX with new source list.
2955  *	* The current state change is a source list change, and the filter
2956  *	  mode retransmit counter is 0.
2957  *		Merge existing rtx allow and block lists with new state:
2958  *		  rtx_allow = (new allow + rtx_allow) - new block
2959  *		  rtx_block = (new block + rtx_block) - new allow
2960  *		Send ALLOW and BLOCK records for new retransmit lists;
2961  *		decrement retransmit counter.
2962  *
2963  * As is the case for mcast_init_rtx(), memory allocation failures are
2964  * acceptable; we just create as much state as we can.
2965  */
2966 static mrec_t *
2967 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2968 {
2969 	ill_t *ill;
2970 	rtx_state_t *rtxp = &ilm->ilm_rtx;
2971 	mcast_record_t txtype;
2972 	mrec_t *rp, *rpnext, *rtnmrec;
2973 	boolean_t ovf;
2974 
2975 	ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);
2976 
2977 	if (mreclist == NULL)
2978 		return (mreclist);
2979 
2980 	/*
2981 	 * A filter mode change is indicated by a single mrec, which is
2982 	 * either TO_IN or TO_EX.  In this case, we just need to set new
2983 	 * retransmit state as if this were an initial join.  There is
2984 	 * no change to the mrec list.
2985 	 */
2986 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
2987 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
2988 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
2989 		    &mreclist->mrec_srcs);
2990 		return (mreclist);
2991 	}
2992 
2993 	/*
2994 	 * Only the source list has changed
2995 	 */
2996 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2997 	if (rtxp->rtx_fmode_cnt > 0) {
2998 		/* but we're still sending filter mode change reports */
2999 		rtxp->rtx_fmode_cnt--;
3000 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
3001 			CLEAR_SLIST(rtxp->rtx_block);
3002 			COPY_SLIST(flist, rtxp->rtx_allow);
3003 			txtype = CHANGE_TO_INCLUDE;
3004 		} else {
3005 			CLEAR_SLIST(rtxp->rtx_allow);
3006 			COPY_SLIST(flist, rtxp->rtx_block);
3007 			txtype = CHANGE_TO_EXCLUDE;
3008 		}
3009 		/* overwrite first mrec with new info */
3010 		mreclist->mrec_type = txtype;
3011 		l_copy(flist, &mreclist->mrec_srcs);
3012 		/* then free any remaining mrecs */
3013 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
3014 			rpnext = rp->mrec_next;
3015 			mi_free(rp);
3016 		}
3017 		mreclist->mrec_next = NULL;
3018 		rtnmrec = mreclist;
3019 	} else {
3020 		mrec_t *allow_mrec, *block_mrec;
3021 		/*
3022 		 * Just send the source change reports; but we need to
3023 		 * recalculate the ALLOW and BLOCK lists based on previous
3024 		 * state and new changes.
3025 		 */
3026 		rtnmrec = mreclist;
3027 		allow_mrec = block_mrec = NULL;
3028 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
3029 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
3030 			    rp->mrec_type == BLOCK_OLD_SOURCES);
3031 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
3032 				allow_mrec = rp;
3033 			else
3034 				block_mrec = rp;
3035 		}
3036 		/*
3037 		 * Perform calculations:
3038 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
3039 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
3040 		 *
3041 		 * Each calc requires two steps, for example:
3042 		 *   rtx_allow = rtx_allow - mrec_block;
3043 		 *   new_allow = mrec_allow + rtx_allow;
3044 		 *
3045 		 * Store results in mrec lists, and then copy into rtx lists.
3046 		 * We do it in this order in case the rtx list hasn't been
3047 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
3048 		 * Overflows are also okay.
3049 		 */
3050 		if (block_mrec != NULL) {
3051 			l_difference_in_a(rtxp->rtx_allow,
3052 			    &block_mrec->mrec_srcs);
3053 		}
3054 		if (allow_mrec != NULL) {
3055 			l_difference_in_a(rtxp->rtx_block,
3056 			    &allow_mrec->mrec_srcs);
3057 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
3058 			    &ovf);
3059 		}
3060 		if (block_mrec != NULL) {
3061 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
3062 			    &ovf);
3063 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
3064 		} else {
3065 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
3066 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
3067 		}
3068 		if (allow_mrec != NULL) {
3069 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
3070 		} else {
3071 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
3072 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
3073 		}
3074 	}
3075 
3076 	return (rtnmrec);
3077 }
3078