xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ire.c (revision c3ea2840)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * This file contains routines that manipulate Internet Routing Entries (IREs).
29  */
30 
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/stropts.h>
34 #include <sys/strsun.h>
35 #include <sys/ddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/policy.h>
38 
39 #include <sys/systm.h>
40 #include <sys/kmem.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <net/if.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <net/if_dl.h>
47 #include <netinet/ip6.h>
48 #include <netinet/icmp6.h>
49 
50 #include <inet/common.h>
51 #include <inet/mi.h>
52 #include <inet/ip.h>
53 #include <inet/ip6.h>
54 #include <inet/ip_ndp.h>
55 #include <inet/arp.h>
56 #include <inet/ip_if.h>
57 #include <inet/ip_ire.h>
58 #include <inet/ip_ftable.h>
59 #include <inet/ip_rts.h>
60 #include <inet/nd.h>
61 
62 #include <net/pfkeyv2.h>
63 #include <inet/ipsec_info.h>
64 #include <inet/sadb.h>
65 #include <inet/tcp.h>
66 #include <inet/ipclassifier.h>
67 #include <sys/zone.h>
68 #include <sys/cpuvar.h>
69 
70 #include <sys/tsol/label.h>
71 #include <sys/tsol/tnet.h>
72 
/*
 * kmem cache handle.  NOTE(review): by its name this presumably backs the
 * rt_entry leaves of the IPv4 radix-tree forwarding table; it is not
 * created or used in the visible part of this file -- confirm.
 */
73 struct kmem_cache *rt_entry_cache;
74 
75 /*
76  * Synchronization notes:
77  *
78  * The fields of the ire_t struct are protected in the following way :
79  *
80  * ire_next/ire_ptpn
81  *
82  *	- bucket lock of the respective tables (cache or forwarding tables).
83  *
84  * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask,
85  * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif,
86  * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr
87  *
88  *	- Set in ire_create_v4/v6 and never changes after that. Thus,
89  *	  we don't need a lock whenever these fields are accessed.
90  *
91  *	- ire_bucket and ire_masklen (also set in ire_create) is set in
92  *        ire_add_v4/ire_add_v6 before inserting in the bucket and never
93  *        changes after that. Thus we don't need a lock whenever these
94  *	  fields are accessed.
95  *
96  * ire_gateway_addr_v4[v6]
97  *
98  *	- ire_gateway_addr_v4[v6] is set during ire_create and later modified
99  *	  by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
100  *	  it are assumed to be atomic and hence the other parts of the code
101  *	  do not use any locks. ire_gateway_addr_v6 updates are not atomic
102  *	  and hence any access to it uses ire_lock to get/set the right value.
103  *
104  * ire_ident, ire_refcnt
105  *
106  *	- Updated atomically using atomic_add_32
107  *
108  * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
109  *
110  *	- Assumes that 32 bit writes are atomic. No locks. ire_lock is
111  *	  used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
112  *
113  * ire_max_frag, ire_frag_flag
114  *
115  *	- ire_lock is used to set/read both of them together.
116  *
117  * ire_tire_mark
118  *
119  *	- Set in ire_create and updated in ire_expire, which is called
120  *	  by only one function namely ip_trash_timer_expire. Thus only
121  *	  one function updates and examines the value.
122  *
123  * ire_marks
124  *	- bucket lock protects this.
125  *
126  * ire_ipsec_overhead/ire_ll_hdr_length
127  *
128  *	- Place holder for returning the information to the upper layers
129  *	  when IRE_DB_REQ comes down.
130  *
131  *
132  * ipv6_ire_default_count is protected by the bucket lock of
133  * ip_forwarding_table_v6[0][0].
134  *
135  * ipv6_ire_default_index is not protected as it  is just a hint
136  * at which default gateway to use. There is nothing
137  * wrong in using the same gateway for two different connections.
138  *
139  * As we always hold the bucket locks in all the places while accessing
140  * the above values, it is natural to use them for protecting them.
141  *
142  * We have a separate cache table and forwarding table for IPv4 and IPv6.
143  * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an
144  * array of irb_t structures. The IPv6 forwarding table
145  * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
146  *  structure. ip_forwarding_table_v6 is allocated dynamically in
147  * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
148  * initializing the same bucket. Once a bucket is initialized, it is never
149  * de-allocated. This assumption enables us to access
150  * ip_forwarding_table_v6[i] without any locks.
151  *
152  * The forwarding table for IPv4 is a radix tree whose leaves
153  * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
154  * for IPv4 is dynamically allocated and freed.
155  *
156  * Each irb_t - ire bucket structure has a lock to protect
157  * a bucket and the ires residing in the bucket have a back pointer to
158  * the bucket structure. It also has a reference count for the number
159  * of threads walking the bucket - irb_refcnt which is bumped up
160  * using the IRB_REFHOLD macro. The flags irb_flags can be
161  * set to IRE_MARK_CONDEMNED indicating that there are some ires
162  * in this bucket that are marked with IRE_MARK_CONDEMNED and the
163  * last thread to leave the bucket should delete the ires. Usually
164  * this is done by the IRB_REFRELE macro which is used to decrement
165  * the reference count on a bucket. See comments above irb_t structure
166  * definition in ip.h for further details.
167  *
168  * IRE_REFHOLD/IRE_REFRELE macros operate on the ire which increments/
169  * decrements the reference count, ire_refcnt, atomically on the ire.
170  * ire_refcnt is modified only using this macro. Operations on the IRE
171  * could be described as follows :
172  *
173  * CREATE an ire with reference count initialized to 1.
174  *
175  * ADDITION of an ire holds the bucket lock, checks for duplicates
176  * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after
177  * bumping up once more i.e the reference count is 2. This is to avoid
178  * an extra lookup in the functions calling ire_add which wants to
179  * work with the ire after adding.
180  *
181  * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD
182  * macro. It is valid to bump up the reference count of the IRE,
183  * after the lookup has returned an ire. Following are the lookup
184  * functions that return a HELD ire :
185  *
186  * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
187  * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
188  * ipif_to_ire[_v6].
189  *
190  * DELETION of an ire holds the bucket lock, removes it from the list
191  * and then decrements the reference count for having removed from the list
192  * by using the IRE_REFRELE macro. If some other thread has looked up
193  * the ire, the reference count would have been bumped up and hence
194  * this ire will not be freed once deleted. It will be freed once the
195  * reference count drops to zero.
196  *
197  * Add and Delete acquires the bucket lock as RW_WRITER, while all the
198  * lookups acquire the bucket lock as RW_READER.
199  *
200  * NOTE : The only functions that do the IRE_REFRELE when an ire is
201  *	  passed as an argument are :
202  *
203  *	  1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
204  *			   broadcast ires it looks up internally within
205  *			   the function. Currently, for simplicity it does
206  *			   not differentiate the one that is passed in and
207  *			   the ones it looks up internally. It always
208  *			   IRE_REFRELEs.
209  *	  2) ire_send
210  *	     ire_send_v6 : As ire_send calls ip_wput_ire and other functions
211  *			   that take ire as an argument, it has to selectively
212  *			   IRE_REFRELE the ire. To maintain symmetry,
213  *			   ire_send_v6 does the same.
214  *
215  * Otherwise, the general rule is to do the IRE_REFRELE in the function
216  * that is passing the ire as an argument.
217  *
218  * In trying to locate ires the following points are to be noted.
219  *
220  * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
221  * to be ignored when walking the ires using ire_next.
222  *
223  * Zones note:
224  *	Walking IREs within a given zone also walks certain ires in other
225  *	zones.  This is done intentionally.  IRE walks with a specified
226  *	zoneid are used only when doing informational reports, and
227  *	zone users want to see things that they can access. See block
228  *	comment in ire_walk_ill_match().
229  */
230 
231 /*
232  * The minimum size of IRE cache table.  It will be recalculated in
233  * ip_ire_init().
234  * Settable in /etc/system
235  */
236 uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;	/* IPv4 table */
237 uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;	/* IPv6 table */
238 
239 /*
240  * The size of the IPv6 forwarding table hash.  We will make sure that
241  * it is a power of 2 in ip_ire_init().
242  * Settable in /etc/system
243  */
244 uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;
245 
/*
 * NOTE(review): ire_cache is presumably the kmem cache that ire_t objects
 * are allocated from; it is not created in the visible part of this file.
 */
246 struct	kmem_cache	*ire_cache;
/* File-scope static, hence zero-initialized: an all-zero ire_t. */
247 static ire_t	ire_null;
248 
249 /*
250  * The threshold number of IRE in a bucket when the IREs are
251  * cleaned up.  This threshold is calculated later in ip_open()
252  * based on the speed of CPU and available memory.  This default
253  * value is the maximum.
254  *
255  * We have two kinds of cached IRE, temporary and
256  * non-temporary.  Temporary IREs are marked with
257  * IRE_MARK_TEMPORARY.  They are IREs created for non
258  * TCP traffic and for forwarding purposes.  All others
259  * are non-temporary IREs.  We don't mark IRE created for
260  * TCP as temporary because TCP is stateful and there are
261  * info stored in the IRE which can be shared by other TCP
262  * connections to the same destination.  For connected
263  * endpoint, we also don't want to mark the IRE used as
264  * temporary because the same IRE will be used frequently,
265  * otherwise, the app should not do a connect().  We change
266  * the marking at ip_bind_connected_*() if necessary.
267  *
268  * We want to keep the cache IRE hash bucket length reasonably
269  * short, otherwise IRE lookup functions will take "forever."
270  * We use the "crude" function that the IRE bucket
271  * length should be based on the CPU speed, which is 1 entry
272  * per x MHz, depending on the shift factor ip_ire_cpu_ratio
273  * (n).  This means that with a 750MHz CPU, the max bucket
274  * length can be (750 >> n) entries.
275  *
276  * Note that this threshold is separate for temp and non-temp
277  * IREs.  This means that the actual bucket length can be
278  * twice as that.  And while we try to keep temporary IRE
279  * length at most at the threshold value, we do not attempt to
280  * make the length for non-temporary IREs fixed, for the
281  * reason stated above.  Instead, we start trying to find
282  * "unused" non-temporary IREs when the bucket length reaches
283  * this threshold and clean them up.
284  *
285  * We also want to limit the amount of memory used by
286  * IREs.  So if we are allowed to use ~3% of memory (M)
287  * for those IREs, each bucket should not have more than
288  *
289  * 	M / num of cache bucket / sizeof (ire_t)
290  *
291  * Again the above memory uses are separate for temp and
292  * non-temp cached IREs.
293  *
294  * We may also want the limit to be a function of the number
295  * of interfaces and number of CPUs.  Doing the initialization
296  * in ip_open() means that every time an interface is plumbed,
297  * the max is re-calculated.  Right now, we don't do anything
298  * different.  In future, when we have more experience, we
299  * may want to change this behavior.
300  */
301 uint32_t ip_ire_max_bucket_cnt = 10;	/* Settable in /etc/system */
302 uint32_t ip6_ire_max_bucket_cnt = 10;	/* IPv6 counterpart */
/* NOTE(review): ip_ire_cleanup_cnt is not described or used nearby. */
303 uint32_t ip_ire_cleanup_cnt = 2;
304 
305 /*
306  * The minimum of the temporary IRE bucket count.  We do not want
307  * the length of each bucket to be too short.  This may hurt
308  * performance of some apps as the temporary IREs are removed too
309  * often.
310  */
311 uint32_t ip_ire_min_bucket_cnt = 3;	/* /etc/system - not used */
312 uint32_t ip6_ire_min_bucket_cnt = 3;	/* IPv6 counterpart */
313 
314 /*
315  * The ratio of memory consumed by IRE used for temporary to available
316  * memory.  This is a shift factor, so 6 means the ratio 1 to 64.  This
317  * value can be changed in /etc/system.  6 is a reasonable number.
318  */
319 uint32_t ip_ire_mem_ratio = 6;	/* /etc/system */
320 /*
 * The shift factor for CPU speed to calculate the max IRE bucket length.
 * With the default of 7, a 750MHz CPU gives 750 >> 7 = 5 entries.
 */
321 uint32_t ip_ire_cpu_ratio = 7;	/* /etc/system */
322 
/*
 * An IPv4 address paired with a "found" flag.
 * NOTE(review): presumably the `arg` handed to ip_nce_clookup_and_delete()
 * (prototyped below) while walking nce entries -- confirm against its
 * definition.
 */
323 typedef struct nce_clookup_s {
324 	ipaddr_t ncecl_addr;	/* IPv4 address of interest */
325 	boolean_t ncecl_found;	/* result flag */
326 } nce_clookup_t;
327 
328 /*
329  * The maximum number of buckets in IRE cache table.  In future, we may
330  * want to make it a dynamic hash table.  For the moment, we fix the
331  * size and allocate the table in ip_ire_init() when IP is first loaded.
332  * We take into account the amount of memory a system has.
333  */
334 #define	IP_MAX_CACHE_TABLE_SIZE	4096
335 
336 /* Settable in /etc/system */
337 static uint32_t	ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;	/* IPv4 */
338 static uint32_t	ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;	/* IPv6 */
339 
340 /* Zero iulp_t for initialization. */
341 const iulp_t	ire_uinfo_null = { 0 };
342 
/* Forward declarations of file-local (static) helpers. */
343 static int	ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
344     ipsq_func_t func, boolean_t);
345 static void	ire_delete_v4(ire_t *ire);
346 static void	ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
347     zoneid_t zoneid, ip_stack_t *);
348 static void	ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
349     pfv_t func, void *arg, uchar_t vers, ill_t *ill);
350 static void	ire_cache_cleanup(irb_t *irb, uint32_t threshold,
351     ire_t *ref_ire);
352 static	void	ip_nce_clookup_and_delete(nce_t *nce, void *arg);
353 static	ire_t	*ip4_ctable_lookup_impl(ire_ctable_args_t *margs);
/* The trace cleanup helper is only declared when DEBUG is defined. */
354 #ifdef DEBUG
355 static void	ire_trace_cleanup(const ire_t *);
356 #endif
357 
358 /*
359  * To avoid bloating the code, we call this function instead of
360  * using the macro IRE_REFRELE. Use macro only in performance
361  * critical paths.
362  *
363  * Must not be called while holding any locks. Otherwise if this is
364  * the last reference to be released there is a chance of recursive mutex
365  * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
366  * to restart an ioctl. The one exception is when the caller is sure that
367  * this is not the last reference to be released. Eg. if the caller is
368  * sure that the ire has not been deleted and won't be deleted.
369  */
370 void
371 ire_refrele(ire_t *ire)
372 {
	/* Out-of-line wrapper: the body is exactly one IRE_REFRELE. */
373 	IRE_REFRELE(ire);
374 }
375 
/*
 * Function form of the IRE_REFRELE_NOTR macro, mirroring ire_refrele().
 * NOTE(review): "_NOTR" presumably means the release is not recorded by
 * reference tracing -- confirm against the macro definition.
 */
376 void
377 ire_refrele_notr(ire_t *ire)
378 {
379 	IRE_REFRELE_NOTR(ire);
380 }
381 
382 /*
383  * kmem_cache_alloc constructor for IRE in kma space.
384  * Note that when ire_mp is set the IRE is stored in that mblk and
385  * not in this cache.
386  */
387 /* ARGSUSED */
388 static int
389 ip_ire_constructor(void *buf, void *cdrarg, int kmflags)
390 {
391 	ire_t	*ire = buf;
392 
393 	ire->ire_nce = NULL;
394 
395 	return (0);
396 }
397 
398 /* ARGSUSED1 */
399 static void
400 ip_ire_destructor(void *buf, void *cdrarg)
401 {
402 	ire_t	*ire = buf;
403 
404 	ASSERT(ire->ire_nce == NULL);
405 }
406 
407 /*
408  * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY
409  * IOCTL.  It is used by TCP (or other ULPs) to supply revised information
410  * for an existing CACHED IRE.
411  */
412 /* ARGSUSED */
413 int
414 ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
415 {
416 	uchar_t	*addr_ucp;
417 	ipic_t	*ipic;
418 	ire_t	*ire;
419 	ipaddr_t	addr;
420 	in6_addr_t	v6addr;
421 	irb_t	*irb;
422 	zoneid_t	zoneid;
423 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
424 
425 	ASSERT(q->q_next == NULL);
426 	zoneid = Q_TO_CONN(q)->conn_zoneid;
427 
428 	/*
429 	 * Check privilege using the ioctl credential; if it is NULL
430 	 * then this is a kernel message and therefor privileged.
431 	 */
432 	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
433 		return (EPERM);
434 
435 	ipic = (ipic_t *)mp->b_rptr;
436 	if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset,
437 	    ipic->ipic_addr_length))) {
438 		return (EINVAL);
439 	}
440 	if (!OK_32PTR(addr_ucp))
441 		return (EINVAL);
442 	switch (ipic->ipic_addr_length) {
443 	case IP_ADDR_LEN: {
444 		/* Extract the destination address. */
445 		addr = *(ipaddr_t *)addr_ucp;
446 		/* Find the corresponding IRE. */
447 		ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
448 		break;
449 	}
450 	case IPV6_ADDR_LEN: {
451 		/* Extract the destination address. */
452 		v6addr = *(in6_addr_t *)addr_ucp;
453 		/* Find the corresponding IRE. */
454 		ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst);
455 		break;
456 	}
457 	default:
458 		return (EINVAL);
459 	}
460 
461 	if (ire == NULL)
462 		return (ENOENT);
463 	/*
464 	 * Update the round trip time estimate and/or the max frag size
465 	 * and/or the slow start threshold.
466 	 *
467 	 * We serialize multiple advises using ire_lock.
468 	 */
469 	mutex_enter(&ire->ire_lock);
470 	if (ipic->ipic_rtt) {
471 		/*
472 		 * If there is no old cached values, initialize them
473 		 * conservatively.  Set them to be (1.5 * new value).
474 		 */
475 		if (ire->ire_uinfo.iulp_rtt != 0) {
476 			ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt +
477 			    ipic->ipic_rtt) >> 1;
478 		} else {
479 			ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt +
480 			    (ipic->ipic_rtt >> 1);
481 		}
482 		if (ire->ire_uinfo.iulp_rtt_sd != 0) {
483 			ire->ire_uinfo.iulp_rtt_sd =
484 			    (ire->ire_uinfo.iulp_rtt_sd +
485 			    ipic->ipic_rtt_sd) >> 1;
486 		} else {
487 			ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd +
488 			    (ipic->ipic_rtt_sd >> 1);
489 		}
490 	}
491 	if (ipic->ipic_max_frag)
492 		ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET);
493 	if (ipic->ipic_ssthresh != 0) {
494 		if (ire->ire_uinfo.iulp_ssthresh != 0)
495 			ire->ire_uinfo.iulp_ssthresh =
496 			    (ipic->ipic_ssthresh +
497 			    ire->ire_uinfo.iulp_ssthresh) >> 1;
498 		else
499 			ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh;
500 	}
501 	/*
502 	 * Don't need the ire_lock below this. ire_type does not change
503 	 * after initialization. ire_marks is protected by irb_lock.
504 	 */
505 	mutex_exit(&ire->ire_lock);
506 
507 	if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) {
508 		/*
509 		 * Only increment the temporary IRE count if the original
510 		 * IRE is not already marked temporary.
511 		 */
512 		irb = ire->ire_bucket;
513 		rw_enter(&irb->irb_lock, RW_WRITER);
514 		if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) &&
515 		    !(ire->ire_marks & IRE_MARK_TEMPORARY)) {
516 			irb->irb_tmp_ire_cnt++;
517 		}
518 		ire->ire_marks |= ipic->ipic_ire_marks;
519 		rw_exit(&irb->irb_lock);
520 	}
521 
522 	ire_refrele(ire);
523 	return (0);
524 }
525 
526 /*
527  * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
528  * IOCTL[s].  The NO_REPLY form is used by TCP to delete a route IRE
529  * for a host that is not responding.  This will force an attempt to
530  * establish a new route, if available, and flush out the ARP entry so
531  * it will re-resolve.  Management processes may want to use the
532  * version that generates a reply.
533  *
534  * This function does not support IPv6 since Neighbor Unreachability Detection
535  * means that negative advise like this is useless.
536  */
537 /* ARGSUSED */
538 int
539 ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
540 {
541 	uchar_t		*addr_ucp;
542 	ipaddr_t	addr;
543 	ire_t		*ire;
544 	ipid_t		*ipid;
545 	boolean_t	routing_sock_info = B_FALSE;	/* Sent info? */
546 	zoneid_t	zoneid;
547 	ire_t		*gire = NULL;
548 	ill_t		*ill;
549 	mblk_t		*arp_mp;
550 	ip_stack_t	*ipst;
551 
	/*
	 * Return values: EPERM if a non-NULL ioc_cr fails the policy
	 * check; EINVAL if the request is not for IRE_CACHE, the address
	 * parameter is missing/misaligned/of an unsupported length, the
	 * IRE found is not an IRE_CACHE, or the IRE is younger than
	 * ips_ip_ignore_delete_time; 0 otherwise, including when no
	 * matching IRE exists.
	 */
552 	ASSERT(q->q_next == NULL);
553 	zoneid = Q_TO_CONN(q)->conn_zoneid;
554 	ipst = CONNQ_TO_IPST(q);
555 
556 	/*
557 	 * Check privilege using the ioctl credential; if it is NULL
558 	 * then this is a kernel message and therefore privileged.
559 	 */
560 	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
561 		return (EPERM);
562 
563 	ipid = (ipid_t *)mp->b_rptr;
564 
565 	/* Only actions on IRE_CACHEs are acceptable at present. */
566 	if (ipid->ipid_ire_type != IRE_CACHE)
567 		return (EINVAL);
568 
569 	addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
570 	    ipid->ipid_addr_length);
571 	if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
572 		return (EINVAL);
	/* The address may be a bare IPv4 address or a whole sin_t. */
573 	switch (ipid->ipid_addr_length) {
574 	case IP_ADDR_LEN:
575 		/* addr_ucp points at IP addr */
576 		break;
577 	case sizeof (sin_t): {
578 		sin_t	*sin;
579 		/*
580 		 * got complete (sockaddr) address - increment addr_ucp to point
581 		 * at the ip_addr field.
582 		 */
583 		sin = (sin_t *)addr_ucp;
584 		addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
585 		break;
586 	}
587 	default:
588 		return (EINVAL);
589 	}
590 	/* Extract the destination address. */
591 	bcopy(addr_ucp, &addr, IP_ADDR_LEN);
592 
593 	/* Try to find the CACHED IRE. */
594 	ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
595 
596 	/* Nail it. */
597 	if (ire) {
598 		/* Allow delete only on CACHE entries */
599 		if (ire->ire_type != IRE_CACHE) {
600 			ire_refrele(ire);
601 			return (EINVAL);
602 		}
603 
604 		/*
605 		 * Verify that the IRE has been around for a while.
606 		 * This is to protect against transport protocols
607 		 * that are too eager in sending delete messages.
608 		 */
609 		if (gethrestime_sec() <
610 		    ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
611 			ire_refrele(ire);
612 			return (EINVAL);
613 		}
614 		/*
615 		 * Now we have a potentially dead cache entry. We need
616 		 * to remove it.
617 		 * If this cache entry is generated from a
618 		 * default route (i.e., ire_cmask == 0),
619 		 * search the default list and mark it dead and some
620 		 * background process will try to activate it.
621 		 */
		/*
		 * What the code below actually does for that case is
		 * advance the bucket's irb_rr_origin past the default
		 * route whose gateway matches this IRE's gateway.
		 */
622 		if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
623 			/*
624 			 * Make sure that we pick a different
625 			 * IRE_DEFAULT next time.
626 			 */
627 			ire_t *gw_ire;
628 			irb_t *irb = NULL;
629 			uint_t match_flags;
630 
631 			match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);
632 
633 			gire = ire_ftable_lookup(ire->ire_addr,
634 			    ire->ire_cmask, 0, 0,
635 			    ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
636 			    ipst);
637 
638 			ip3dbg(("ire_ftable_lookup() returned gire %p\n",
639 			    (void *)gire));
640 
641 			if (gire != NULL) {
642 				irb = gire->ire_bucket;
643 
644 				/*
645 				 * We grab it as writer just to serialize
646 				 * multiple threads trying to bump up
647 				 * irb_rr_origin
648 				 */
649 				rw_enter(&irb->irb_lock, RW_WRITER);
650 				if ((gw_ire = irb->irb_rr_origin) == NULL) {
					/* No origin set: unlock and bail. */
651 					rw_exit(&irb->irb_lock);
652 					goto done;
653 				}
654 
655 				DTRACE_PROBE1(ip__ire__del__origin,
656 				    (ire_t *), gw_ire);
657 
658 				/* Skip past the potentially bad gateway */
659 				if (ire->ire_gateway_addr ==
660 				    gw_ire->ire_gateway_addr) {
661 					ire_t *next = gw_ire->ire_next;
662 
663 					DTRACE_PROBE2(ip__ire__del,
664 					    (ire_t *), gw_ire, (irb_t *), irb);
665 					IRE_FIND_NEXT_ORIGIN(next);
666 					irb->irb_rr_origin = next;
667 				}
668 				rw_exit(&irb->irb_lock);
669 			}
670 		}
671 done:
		/* gire is non-NULL only if the ftable lookup above ran. */
672 		if (gire != NULL)
673 			IRE_REFRELE(gire);
674 		/* report the bad route to routing sockets */
675 		ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
676 		    ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
677 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
678 		routing_sock_info = B_TRUE;
679 
680 		/*
681 		 * TCP is really telling us to start over completely, and it
682 		 * expects that we'll resend the ARP query.  Tell ARP to
683 		 * discard the entry, if this is a local destination.
684 		 *
685 		 * But, if the ARP entry is permanent then it shouldn't be
686 		 * deleted, so we set ARED_F_PRESERVE_PERM.
687 		 */
		/* NOTE(review): assumes ire_stq is non-NULL here -- confirm. */
688 		ill = ire->ire_stq->q_ptr;
689 		if (ire->ire_gateway_addr == 0 &&
690 		    (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
691 			ared_t *ared = (ared_t *)arp_mp->b_rptr;
692 
693 			ASSERT(ared->ared_cmd == AR_ENTRY_DELETE);
694 			ared->ared_flags |= ARED_F_PRESERVE_PERM;
695 			putnext(ill->ill_rq, arp_mp);
696 		}
697 
		/* Delete the IRE, then drop the hold from the lookup. */
698 		ire_delete(ire);
699 		ire_refrele(ire);
700 	}
701 	/*
702 	 * Also look for an IRE_HOST type redirect ire and
703 	 * remove it if present.
704 	 */
705 	ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
706 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
707 
708 	/* Nail it. */
709 	if (ire != NULL) {
		/* Only RTF_DYNAMIC host routes are deleted. */
710 		if (ire->ire_flags & RTF_DYNAMIC) {
			/* Report RTM_LOSING only if not sent above. */
711 			if (!routing_sock_info) {
712 				ip_rts_change(RTM_LOSING, ire->ire_addr,
713 				    ire->ire_gateway_addr, ire->ire_mask,
714 				    ire->ire_src_addr, 0, 0, 0,
715 				    (RTA_DST | RTA_GATEWAY |
716 				    RTA_NETMASK | RTA_IFA),
717 				    ipst);
718 			}
719 			ire_delete(ire);
720 		}
721 		ire_refrele(ire);
722 	}
723 	return (0);
724 }
725 
726 /*
727  * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
728  * down from the Upper Level Protocol to request a copy of the IRE (to check
729  * its type or to extract information like round-trip time estimates or the
730  * MTU.)
731  * The address is assumed to be in the ire_addr field. If no IRE is found
732  * an IRE is returned with ire_type being zero.
733  * Note that the upper lavel protocol has to check for broadcast
734  * (IRE_BROADCAST) and multicast (CLASSD(addr)).
735  * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
736  * end of the returned message.
737  *
738  * TCP sends down a message of this type with a connection request packet
739  * chained on. UDP and ICMP send it down to verify that a route exists for
740  * the destination address when they get connected.
741  */
742 void
743 ip_ire_req(queue_t *q, mblk_t *mp)
744 {
745 	ire_t	*inire;
746 	ire_t	*ire;
747 	mblk_t	*mp1;
748 	ire_t	*sire = NULL;
749 	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
750 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
751 
752 	ASSERT(q->q_next == NULL);
753 
754 	if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
755 	    !OK_32PTR(mp->b_rptr)) {
756 		freemsg(mp);
757 		return;
758 	}
759 	inire = (ire_t *)mp->b_rptr;
760 	/*
761 	 * Got it, now take our best shot at an IRE.
762 	 */
763 	if (inire->ire_ipversion == IPV6_VERSION) {
764 		ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
765 		    NULL, &sire, zoneid, NULL,
766 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
767 	} else {
768 		ASSERT(inire->ire_ipversion == IPV4_VERSION);
769 		ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
770 		    NULL, &sire, zoneid, NULL,
771 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
772 	}
773 
774 	/*
775 	 * We prevent returning IRES with source address INADDR_ANY
776 	 * as these were temporarily created for sending packets
777 	 * from endpoints that have conn_unspec_src set.
778 	 */
779 	if (ire == NULL ||
780 	    (ire->ire_ipversion == IPV4_VERSION &&
781 	    ire->ire_src_addr == INADDR_ANY) ||
782 	    (ire->ire_ipversion == IPV6_VERSION &&
783 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
784 		inire->ire_type = 0;
785 	} else {
786 		bcopy(ire, inire, sizeof (ire_t));
787 		/* Copy the route metrics from the parent. */
788 		if (sire != NULL) {
789 			bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
790 			    sizeof (iulp_t));
791 		}
792 
793 		/*
794 		 * As we don't lookup global policy here, we may not
795 		 * pass the right size if per-socket policy is not
796 		 * present. For these cases, path mtu discovery will
797 		 * do the right thing.
798 		 */
799 		inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q));
800 
801 		/* Pass the latest setting of the ip_path_mtu_discovery */
802 		inire->ire_frag_flag |=
803 		    (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
804 	}
805 	if (ire != NULL)
806 		ire_refrele(ire);
807 	if (sire != NULL)
808 		ire_refrele(sire);
809 	mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
810 	mp->b_datap->db_type = IRE_DB_TYPE;
811 
812 	/* Put the IRE_DB_TYPE mblk last in the chain */
813 	mp1 = mp->b_cont;
814 	if (mp1 != NULL) {
815 		mp->b_cont = NULL;
816 		linkb(mp1, mp);
817 		mp = mp1;
818 	}
819 	qreply(q, mp);
820 }
821 
822 /*
823  * Send a packet using the specified IRE.
824  * If ire_src_addr is INADDR_ANY (all zero) then discard the IRE after
825  * the packet has been sent.
826  */
827 static void
828 ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
829 {
830 	mblk_t *ipsec_mp;
831 	boolean_t is_secure;
832 	uint_t ifindex;
833 	ill_t	*ill;
834 	zoneid_t zoneid = ire->ire_zoneid;
835 	ip_stack_t	*ipst = ire->ire_ipst;
836 
837 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
838 	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
839 	ipsec_mp = pkt;
840 	is_secure = (pkt->b_datap->db_type == M_CTL);
841 	if (is_secure) {
842 		ipsec_out_t *io;
843 
844 		pkt = pkt->b_cont;
845 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
846 		if (io->ipsec_out_type == IPSEC_OUT)
847 			zoneid = io->ipsec_out_zoneid;
848 	}
849 
850 	/* If the packet originated externally then */
851 	if (pkt->b_prev) {
852 		ire_refrele(ire);
853 		/*
854 		 * Extract the ifindex from b_prev (set in ip_rput_noire).
855 		 * Look up interface to see if it still exists (it could have
856 		 * been unplumbed by the time the reply came back from ARP)
857 		 */
858 		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
859 		ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
860 		    NULL, NULL, NULL, NULL, ipst);
861 		if (ill == NULL) {
862 			pkt->b_prev = NULL;
863 			pkt->b_next = NULL;
864 			freemsg(ipsec_mp);
865 			return;
866 		}
867 		q = ill->ill_rq;
868 		pkt->b_prev = NULL;
869 		/*
870 		 * This packet has not gone through IPSEC processing
871 		 * and hence we should not have any IPSEC message
872 		 * prepended.
873 		 */
874 		ASSERT(ipsec_mp == pkt);
875 		put(q, pkt);
876 		ill_refrele(ill);
877 	} else if (pkt->b_next) {
878 		/* Packets from multicast router */
879 		pkt->b_next = NULL;
880 		/*
881 		 * We never get the IPSEC_OUT while forwarding the
882 		 * packet for multicast router.
883 		 */
884 		ASSERT(ipsec_mp == pkt);
885 		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
886 		ire_refrele(ire);
887 	} else {
888 		/* Locally originated packets */
889 		boolean_t delete_ire = B_FALSE;
890 		ipha_t *ipha = (ipha_t *)pkt->b_rptr;
891 
892 		/*
893 		 * If this IRE shouldn't be kept in the table (because its
894 		 * source address is unspecified), hold a reference to it so
895 		 * we can delete it even after e.g. ip_wput_ire() has dropped
896 		 * its reference.
897 		 */
898 		if (!(ire->ire_marks & IRE_MARK_NOADD) &&
899 		    ire->ire_src_addr == INADDR_ANY) {
900 			delete_ire = B_TRUE;
901 			IRE_REFHOLD(ire);
902 		}
903 
904 		/*
905 		 * If we were resolving a router we can not use the
906 		 * router's IRE for sending the packet (since it would
907 		 * violate the uniqueness of the IP idents) thus we
908 		 * make another pass through ip_wput to create the IRE_CACHE
909 		 * for the destination.
910 		 * When IRE_MARK_NOADD is set, ire_add() is not called.
911 		 * Thus ip_wput() will never find an ire and result in an
912 		 * infinite loop. Thus we check whether IRE_MARK_NOADD
913 		 * is set. This also implies that IRE_MARK_NOADD can only be
914 		 * used to send packets to directly connected hosts.
915 		 */
916 		if (ipha->ipha_dst != ire->ire_addr &&
917 		    !(ire->ire_marks & IRE_MARK_NOADD)) {
918 			ire_refrele(ire);	/* Held in ire_add */
919 			if (CONN_Q(q)) {
920 				(void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
921 				    IRE_SEND);
922 			} else {
923 				(void) ip_output((void *)(uintptr_t)zoneid,
924 				    ipsec_mp, q, IRE_SEND);
925 			}
926 		} else {
927 			if (is_secure) {
928 				ipsec_out_t *oi;
929 				ipha_t *ipha;
930 
931 				oi = (ipsec_out_t *)ipsec_mp->b_rptr;
932 				ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
933 				if (oi->ipsec_out_proc_begin) {
934 					/*
935 					 * This is the case where
936 					 * ip_wput_ipsec_out could not find
937 					 * the IRE and recreated a new one.
938 					 * As ip_wput_ipsec_out does ire
939 					 * lookups, ire_refrele for the extra
940 					 * bump in ire_add.
941 					 */
942 					ire_refrele(ire);
943 					ip_wput_ipsec_out(q, ipsec_mp, ipha,
944 					    NULL, NULL);
945 				} else {
946 					/*
947 					 * IRE_REFRELE will be done in
948 					 * ip_wput_ire.
949 					 */
950 					ip_wput_ire(q, ipsec_mp, ire, NULL,
951 					    IRE_SEND, zoneid);
952 				}
953 			} else {
954 				/*
955 				 * IRE_REFRELE will be done in ip_wput_ire.
956 				 */
957 				ip_wput_ire(q, ipsec_mp, ire, NULL,
958 				    IRE_SEND, zoneid);
959 			}
960 		}
961 		/*
962 		 * Special code to support sending a single packet with
963 		 * conn_unspec_src using an IRE which has no source address.
964 		 * The IRE is deleted here after sending the packet to avoid
965 		 * having other code trip on it. But before we delete the
966 		 * ire, somebody could have looked up this ire.
967 		 * We prevent returning/using this IRE by the upper layers
968 		 * by making checks to NULL source address in other places
969 		 * like e.g ip_ire_append, ip_ire_req and ip_bind_connected.
970 		 * Though this does not completely prevent other threads
971 		 * from using this ire, this should not cause any problems.
972 		 */
973 		if (delete_ire) {
974 			ip1dbg(("ire_send: delete IRE\n"));
975 			ire_delete(ire);
976 			ire_refrele(ire);	/* Held above */
977 		}
978 	}
979 }
980 
981 /*
982  * Send a packet using the specified IRE.
983  * If ire_src_addr_v6 is all zero then discard the IRE after
984  * the packet has been sent.
985  */
986 static void
987 ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
988 {
989 	mblk_t *ipsec_mp;
990 	boolean_t secure;
991 	uint_t ifindex;
992 	zoneid_t zoneid = ire->ire_zoneid;
993 	ip_stack_t	*ipst = ire->ire_ipst;
994 
995 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
996 	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
997 	if (pkt->b_datap->db_type == M_CTL) {
998 		ipsec_out_t *io;
999 
1000 		ipsec_mp = pkt;
1001 		pkt = pkt->b_cont;
1002 		secure = B_TRUE;
1003 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
1004 		if (io->ipsec_out_type == IPSEC_OUT)
1005 			zoneid = io->ipsec_out_zoneid;
1006 	} else {
1007 		ipsec_mp = pkt;
1008 		secure = B_FALSE;
1009 	}
1010 
1011 	/* If the packet originated externally then */
1012 	if (pkt->b_prev) {
1013 		ill_t	*ill;
1014 		/*
1015 		 * Extract the ifindex from b_prev (set in ip_rput_data_v6).
1016 		 * Look up interface to see if it still exists (it could have
1017 		 * been unplumbed by the time the reply came back from the
1018 		 * resolver).
1019 		 */
1020 		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
1021 		ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
1022 		    NULL, NULL, NULL, NULL, ipst);
1023 		if (ill == NULL) {
1024 			pkt->b_prev = NULL;
1025 			pkt->b_next = NULL;
1026 			freemsg(ipsec_mp);
1027 			ire_refrele(ire);	/* Held in ire_add */
1028 			return;
1029 		}
1030 		q = ill->ill_rq;
1031 		pkt->b_prev = NULL;
1032 		/*
1033 		 * This packet has not gone through IPSEC processing
1034 		 * and hence we should not have any IPSEC message
1035 		 * prepended.
1036 		 */
1037 		ASSERT(ipsec_mp == pkt);
1038 		put(q, pkt);
1039 		ill_refrele(ill);
1040 	} else if (pkt->b_next) {
1041 		/* Packets from multicast router */
1042 		pkt->b_next = NULL;
1043 		/*
1044 		 * We never get the IPSEC_OUT while forwarding the
1045 		 * packet for multicast router.
1046 		 */
1047 		ASSERT(ipsec_mp == pkt);
1048 		/*
1049 		 * XXX TODO IPv6.
1050 		 */
1051 		freemsg(pkt);
1052 #ifdef XXX
1053 		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL);
1054 #endif
1055 	} else {
1056 		if (secure) {
1057 			ipsec_out_t *oi;
1058 			ip6_t *ip6h;
1059 
1060 			oi = (ipsec_out_t *)ipsec_mp->b_rptr;
1061 			ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr;
1062 			if (oi->ipsec_out_proc_begin) {
1063 				/*
1064 				 * This is the case where
1065 				 * ip_wput_ipsec_out could not find
1066 				 * the IRE and recreated a new one.
1067 				 */
1068 				ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h,
1069 				    NULL, NULL);
1070 			} else {
1071 				if (CONN_Q(q)) {
1072 					(void) ip_output_v6(Q_TO_CONN(q),
1073 					    ipsec_mp, q, IRE_SEND);
1074 				} else {
1075 					(void) ip_output_v6(
1076 					    (void *)(uintptr_t)zoneid,
1077 					    ipsec_mp, q, IRE_SEND);
1078 				}
1079 			}
1080 		} else {
1081 			/*
1082 			 * Send packets through ip_output_v6 so that any
1083 			 * ip6_info header can be processed again.
1084 			 */
1085 			if (CONN_Q(q)) {
1086 				(void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q,
1087 				    IRE_SEND);
1088 			} else {
1089 				(void) ip_output_v6((void *)(uintptr_t)zoneid,
1090 				    ipsec_mp, q, IRE_SEND);
1091 			}
1092 		}
1093 		/*
1094 		 * Special code to support sending a single packet with
1095 		 * conn_unspec_src using an IRE which has no source address.
1096 		 * The IRE is deleted here after sending the packet to avoid
1097 		 * having other code trip on it. But before we delete the
1098 		 * ire, somebody could have looked up this ire.
1099 		 * We prevent returning/using this IRE by the upper layers
1100 		 * by making checks to NULL source address in other places
1101 		 * like e.g ip_ire_append_v6, ip_ire_req and
1102 		 * ip_bind_connected_v6. Though, this does not completely
1103 		 * prevent other threads from using this ire, this should
1104 		 * not cause any problems.
1105 		 */
1106 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
1107 			ip1dbg(("ire_send_v6: delete IRE\n"));
1108 			ire_delete(ire);
1109 		}
1110 	}
1111 	ire_refrele(ire);	/* Held in ire_add */
1112 }
1113 
1114 /*
1115  * Make sure that IRE bucket does not get too long.
1116  * This can cause lock up because ire_cache_lookup()
1117  * may take "forever" to finish.
1118  *
1119  * We only remove a maximum of cnt IREs each time.  This
1120  * should keep the bucket length approximately constant,
1121  * depending on cnt.  This should be enough to defend
1122  * against DoS attack based on creating temporary IREs
1123  * (for forwarding and non-TCP traffic).
1124  *
1125  * We also pass in the address of the newly created IRE
1126  * as we do not want to remove this straight after adding
1127  * it. New IREs are normally added at the tail of the
1128  * bucket.  This means that we are removing the "oldest"
1129  * temporary IREs added.  Only if there are IREs with
1130  * the same ire_addr, do we not add it at the tail.  Refer
1131  * to ire_add_v*().  It should be OK for our purpose.
1132  *
1133  * For non-temporary cached IREs, we make sure that they
1134  * have not been used for some time (defined below), they
1135  * are non-local destinations, and there is no one using
1136  * them at the moment (refcnt == 1).
1137  *
1138  * The above means that the IRE bucket length may become
1139  * very long, consisting of mostly non-temporary IREs.
1140  * This can happen when the hash function does a bad job
1141  * so that most TCP connections cluster to a specific bucket.
1142  * This "hopefully" should never happen.  It can also
1143  * happen if most TCP connections have very long lives.
1144  * Even with the minimal hash table size of 256, there
1145  * has to be a lot of such connections to make the bucket
1146  * length unreasonably long.  This should probably not
1147  * happen either.  The third can when this can happen is
1148  * when the machine is under attack, such as SYN flooding.
1149  * TCP should already have the proper mechanism to protect
1150  * that.  So we should be safe.
1151  *
1152  * This function is called by ire_add_then_send() after
1153  * a new IRE is added and the packet is sent.
1154  *
1155  * The idle cutoff interval is set to 60s.  It can be
1156  * changed using /etc/system.
1157  */
1158 uint32_t ire_idle_cutoff_interval = 60000;
1159 
1160 static void
1161 ire_cache_cleanup(irb_t *irb, uint32_t threshold, ire_t *ref_ire)
1162 {
1163 	ire_t *ire;
1164 	clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000);
1165 	int cnt = ip_ire_cleanup_cnt;
1166 
1167 	/*
1168 	 * Try to remove cnt temporary IREs first.
1169 	 */
1170 	for (ire = irb->irb_ire; cnt > 0 && ire != NULL; ire = ire->ire_next) {
1171 		if (ire == ref_ire)
1172 			continue;
1173 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
1174 			continue;
1175 		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
1176 			ASSERT(ire->ire_type == IRE_CACHE);
1177 			ire_delete(ire);
1178 			cnt--;
1179 		}
1180 	}
1181 	if (cnt == 0)
1182 		return;
1183 
1184 	/*
1185 	 * If we didn't satisfy our removal target from temporary IREs
1186 	 * we see how many non-temporary IREs are currently in the bucket.
1187 	 * If this quantity is above the threshold then we see if there are any
1188 	 * candidates for removal. We are still limited to removing a maximum
1189 	 * of cnt IREs.
1190 	 */
1191 	if ((irb->irb_ire_cnt - irb->irb_tmp_ire_cnt) > threshold) {
1192 		for (ire = irb->irb_ire; cnt > 0 && ire != NULL;
1193 		    ire = ire->ire_next) {
1194 			if (ire == ref_ire)
1195 				continue;
1196 			if (ire->ire_type != IRE_CACHE)
1197 				continue;
1198 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
1199 				continue;
1200 			if ((ire->ire_refcnt == 1) &&
1201 			    (lbolt - ire->ire_last_used_time > cut_off)) {
1202 				ire_delete(ire);
1203 				cnt--;
1204 			}
1205 		}
1206 	}
1207 }
1208 
1209 /*
1210  * ire_add_then_send is called when a new IRE has been created in order to
1211  * route an outgoing packet.  Typically, it is called from ip_wput when
1212  * a response comes back down from a resolver.  We add the IRE, and then
1213  * possibly run the packet through ip_wput or ip_rput, as appropriate.
1214  * However, we do not add the newly created IRE in the cache when
1215  * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at
1216  * ip_newroute_ipif(). The ires with IRE_MARK_NOADD are ire_refrele'd by
1217  * ip_wput_ire() and get deleted.
1218  * Multirouting support: the packet is silently discarded when the new IRE
1219  * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the
1220  * RTF_MULTIRT flag for the same destination address.
1221  * In this case, we just want to register this additional ire without
1222  * sending the packet, as it has already been replicated through
1223  * existing multirt routes in ip_wput().
1224  */
1225 void
1226 ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
1227 {
1228 	irb_t *irb;
1229 	boolean_t drop = B_FALSE;
1230 	boolean_t mctl_present;
1231 	mblk_t *first_mp = NULL;
1232 	mblk_t *data_mp = NULL;
1233 	ire_t *dst_ire;
1234 	ipha_t *ipha;
1235 	ip6_t *ip6h;
1236 	ip_stack_t	*ipst = ire->ire_ipst;
1237 	int		ire_limit;
1238 
1239 	if (mp != NULL) {
1240 		/*
1241 		 * We first have to retrieve the destination address carried
1242 		 * by the packet.
1243 		 * We can't rely on ire as it can be related to a gateway.
1244 		 * The destination address will help in determining if
1245 		 * other RTF_MULTIRT ires are already registered.
1246 		 *
1247 		 * We first need to know where we are going : v4 or V6.
1248 		 * the ire version is enough, as there is no risk that
1249 		 * we resolve an IPv6 address with an IPv4 ire
1250 		 * or vice versa.
1251 		 */
1252 		EXTRACT_PKT_MP(mp, first_mp, mctl_present);
1253 		data_mp = mp;
1254 		mp = first_mp;
1255 		if (ire->ire_ipversion == IPV4_VERSION) {
1256 			ipha = (ipha_t *)data_mp->b_rptr;
1257 			dst_ire = ire_cache_lookup(ipha->ipha_dst,
1258 			    ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
1259 		} else {
1260 			ASSERT(ire->ire_ipversion == IPV6_VERSION);
1261 			ip6h = (ip6_t *)data_mp->b_rptr;
1262 			dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
1263 			    ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
1264 		}
1265 		if (dst_ire != NULL) {
1266 			if (dst_ire->ire_flags & RTF_MULTIRT) {
1267 				/*
1268 				 * At least one resolved multirt route
1269 				 * already exists for the destination,
1270 				 * don't sent this packet: either drop it
1271 				 * or complete the pending resolution,
1272 				 * depending on the ire.
1273 				 */
1274 				drop = B_TRUE;
1275 			}
1276 			ip1dbg(("ire_add_then_send: dst_ire %p "
1277 			    "[dst %08x, gw %08x], drop %d\n",
1278 			    (void *)dst_ire,
1279 			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
1280 			    ntohl(dst_ire->ire_addr) : \
1281 			    ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)),
1282 			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
1283 			    ntohl(dst_ire->ire_gateway_addr) : \
1284 			    ntohl(V4_PART_OF_V6(
1285 			    dst_ire->ire_gateway_addr_v6)),
1286 			    drop));
1287 			ire_refrele(dst_ire);
1288 		}
1289 	}
1290 
1291 	if (!(ire->ire_marks & IRE_MARK_NOADD)) {
1292 		/* Regular packets with cache bound ires are here. */
1293 		(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
1294 
1295 		if (ire == NULL) {
1296 			mp->b_prev = NULL;
1297 			mp->b_next = NULL;
1298 			MULTIRT_DEBUG_UNTAG(mp);
1299 			freemsg(mp);
1300 			return;
1301 		}
1302 		if (mp == NULL) {
1303 			ire_refrele(ire);	/* Held in ire_add_v4/v6 */
1304 			return;
1305 		}
1306 	}
1307 	if (drop) {
1308 		/*
1309 		 * If we're adding an RTF_MULTIRT ire, the resolution
1310 		 * is over: we just drop the packet.
1311 		 */
1312 		if (ire->ire_flags & RTF_MULTIRT) {
1313 			data_mp->b_prev = NULL;
1314 			data_mp->b_next = NULL;
1315 			MULTIRT_DEBUG_UNTAG(mp);
1316 			freemsg(mp);
1317 		} else {
1318 			/*
1319 			 * Otherwise, we're adding the ire to a gateway
1320 			 * for a multirt route.
1321 			 * Invoke ip_newroute() to complete the resolution
1322 			 * of the route. We will then come back here and
1323 			 * finally drop this packet in the above code.
1324 			 */
1325 			if (ire->ire_ipversion == IPV4_VERSION) {
1326 				/*
1327 				 * TODO: in order for CGTP to work in non-global
1328 				 * zones, ip_newroute() must create the IRE
1329 				 * cache in the zone indicated by
1330 				 * ire->ire_zoneid.
1331 				 */
1332 				ip_newroute(q, mp, ipha->ipha_dst,
1333 				    (CONN_Q(q) ? Q_TO_CONN(q) : NULL),
1334 				    ire->ire_zoneid, ipst);
1335 			} else {
1336 				int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN;
1337 
1338 				ASSERT(ire->ire_ipversion == IPV6_VERSION);
1339 
1340 				/*
1341 				 * If necessary, skip over the ip6i_t to find
1342 				 * the header with the actual source address.
1343 				 */
1344 				if (ip6h->ip6_nxt == IPPROTO_RAW) {
1345 					if (MBLKL(data_mp) < minlen &&
1346 					    pullupmsg(data_mp, -1) == 0) {
1347 						ip1dbg(("ire_add_then_send: "
1348 						    "cannot pullupmsg ip6i\n"));
1349 						if (mctl_present)
1350 							freeb(first_mp);
1351 						ire_refrele(ire);
1352 						return;
1353 					}
1354 					ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN);
1355 					ip6h = (ip6_t *)(data_mp->b_rptr +
1356 					    sizeof (ip6i_t));
1357 				}
1358 				ip_newroute_v6(q, mp, &ip6h->ip6_dst,
1359 				    &ip6h->ip6_src, NULL, ire->ire_zoneid,
1360 				    ipst);
1361 			}
1362 		}
1363 
1364 		ire_refrele(ire); /* As done by ire_send(). */
1365 		return;
1366 	}
1367 	/*
1368 	 * Need to remember ire_bucket here as ire_send*() may delete
1369 	 * the ire so we cannot reference it after that.
1370 	 */
1371 	irb = ire->ire_bucket;
1372 	if (ire->ire_ipversion == IPV4_VERSION) {
1373 		ire_send(q, mp, ire);
1374 		ire_limit = ip_ire_max_bucket_cnt;
1375 	} else {
1376 		ire_send_v6(q, mp, ire);
1377 		ire_limit = ip6_ire_max_bucket_cnt;
1378 	}
1379 
1380 	/*
1381 	 * irb is NULL if the IRE was not added to the hash. This happens
1382 	 * when IRE_MARK_NOADD is set and when IREs are returned from
1383 	 * ire_update_srcif_v4().
1384 	 */
1385 	if (irb != NULL) {
1386 		IRB_REFHOLD(irb);
1387 		if (irb->irb_ire_cnt > ire_limit)
1388 			ire_cache_cleanup(irb, ire_limit, ire);
1389 		IRB_REFRELE(irb);
1390 	}
1391 }
1392 
1393 /*
1394  * Initialize the ire that is specific to IPv4 part and call
1395  * ire_init_common to finish it.
1396  */
1397 ire_t *
1398 ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr,
1399     uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
1400     queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
1401     uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
1402     tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
1403 {
1404 	ASSERT(type != IRE_CACHE || stq != NULL);
1405 	/*
1406 	 * Reject IRE security attribute creation/initialization
1407 	 * if system is not running in Trusted mode.
1408 	 */
1409 	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
1410 		return (NULL);
1411 
1412 	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);
1413 
1414 	if (addr != NULL)
1415 		bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
1416 	if (src_addr != NULL)
1417 		bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN);
1418 	if (mask != NULL) {
1419 		bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
1420 		ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
1421 	}
1422 	if (gateway != NULL) {
1423 		bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);
1424 	}
1425 
1426 	if (type == IRE_CACHE)
1427 		ire->ire_cmask = cmask;
1428 
1429 	/* ire_init_common will free the mblks upon encountering any failure */
1430 	if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif,
1431 	    phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst))
1432 		return (NULL);
1433 
1434 	return (ire);
1435 }
1436 
1437 /*
1438  * Similar to ire_create except that it is called only when
1439  * we want to allocate ire as an mblk e.g. we have an external
1440  * resolver ARP.
1441  */
1442 ire_t *
1443 ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
1444     uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
1445     ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle,
1446     uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp,
1447     ip_stack_t *ipst)
1448 {
1449 	ire_t	*ire, *buf;
1450 	ire_t	*ret_ire;
1451 	mblk_t	*mp;
1452 	size_t	bufsize;
1453 	frtn_t	*frtnp;
1454 	ill_t	*ill;
1455 
1456 	bufsize = sizeof (ire_t) + sizeof (frtn_t);
1457 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1458 	if (buf == NULL) {
1459 		ip1dbg(("ire_create_mp: alloc failed\n"));
1460 		return (NULL);
1461 	}
1462 	frtnp = (frtn_t *)(buf + 1);
1463 	frtnp->free_arg = (caddr_t)buf;
1464 	frtnp->free_func = ire_freemblk;
1465 
1466 	/*
1467 	 * Allocate the new IRE. The ire created will hold a ref on
1468 	 * an nce_t after ire_nce_init, and this ref must either be
1469 	 * (a)  transferred to the ire_cache entry created when ire_add_v4
1470 	 *	is called after successful arp resolution, or,
1471 	 * (b)  released, when arp resolution fails
1472 	 * Case (b) is handled in ire_freemblk() which will be called
1473 	 * when mp is freed as a result of failed arp.
1474 	 */
1475 	mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
1476 	if (mp == NULL) {
1477 		ip1dbg(("ire_create_mp: alloc failed\n"));
1478 		kmem_free(buf, bufsize);
1479 		return (NULL);
1480 	}
1481 	ire = (ire_t *)mp->b_rptr;
1482 	mp->b_wptr = (uchar_t *)&ire[1];
1483 
1484 	/* Start clean. */
1485 	*ire = ire_null;
1486 	ire->ire_mp = mp;
1487 	mp->b_datap->db_type = IRE_DB_TYPE;
1488 	ire->ire_marks |= IRE_MARK_UNCACHED;
1489 
1490 	ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce,
1491 	    rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc,
1492 	    gcgrp, ipst);
1493 
1494 	ill = (ill_t *)(stq->q_ptr);
1495 	if (ret_ire == NULL) {
1496 		/* ire_freemblk needs these set */
1497 		ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
1498 		ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
1499 		ire->ire_ipst = ipst;
1500 		freeb(ire->ire_mp);
1501 		return (NULL);
1502 	}
1503 	ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
1504 	ret_ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
1505 	ASSERT(ret_ire == ire);
1506 	ASSERT(ret_ire->ire_ipst == ipst);
1507 	/*
1508 	 * ire_max_frag is normally zero here and is atomically set
1509 	 * under the irebucket lock in ire_add_v[46] except for the
1510 	 * case of IRE_MARK_NOADD. In that event the the ire_max_frag
1511 	 * is non-zero here.
1512 	 */
1513 	ire->ire_max_frag = max_frag;
1514 	return (ire);
1515 }
1516 
1517 /*
1518  * ire_create is called to allocate and initialize a new IRE.
1519  *
1520  * NOTE : This is called as writer sometimes though not required
1521  * by this function.
1522  */
1523 ire_t *
1524 ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
1525     uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
1526     ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
1527     uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
1528     tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
1529 {
1530 	ire_t	*ire;
1531 	ire_t	*ret_ire;
1532 
1533 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
1534 	if (ire == NULL) {
1535 		ip1dbg(("ire_create: alloc failed\n"));
1536 		return (NULL);
1537 	}
1538 	*ire = ire_null;
1539 
1540 	ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp,
1541 	    src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags,
1542 	    ulp_info, gc, gcgrp, ipst);
1543 
1544 	if (ret_ire == NULL) {
1545 		kmem_cache_free(ire_cache, ire);
1546 		return (NULL);
1547 	}
1548 	ASSERT(ret_ire == ire);
1549 	return (ire);
1550 }
1551 
1552 /*
1553  * Common to IPv4 and IPv6
1554  */
1555 boolean_t
1556 ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
1557     queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle,
1558     uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info,
1559     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
1560 {
1561 	ire->ire_max_fragp = max_fragp;
1562 	ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
1563 
1564 #ifdef DEBUG
1565 	if (ipif != NULL) {
1566 		if (ipif->ipif_isv6)
1567 			ASSERT(ipversion == IPV6_VERSION);
1568 		else
1569 			ASSERT(ipversion == IPV4_VERSION);
1570 	}
1571 #endif /* DEBUG */
1572 
1573 	/*
1574 	 * Create/initialize IRE security attribute only in Trusted mode;
1575 	 * if the passed in gc/gcgrp is non-NULL, we expect that the caller
1576 	 * has held a reference to it and will release it when this routine
1577 	 * returns a failure, otherwise we own the reference.  We do this
1578 	 * prior to initializing the rest IRE fields.
1579 	 *
1580 	 * Don't allocate ire_gw_secattr for the resolver case to prevent
1581 	 * memory leak (in case of external resolution failure). We'll
1582 	 * allocate it after a successful external resolution, in ire_add().
1583 	 * Note that ire->ire_mp != NULL here means this ire is headed
1584 	 * to an external resolver.
1585 	 */
1586 	if (is_system_labeled()) {
1587 		if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
1588 		    IRE_INTERFACE)) != 0) {
1589 			/* release references on behalf of caller */
1590 			if (gc != NULL)
1591 				GC_REFRELE(gc);
1592 			if (gcgrp != NULL)
1593 				GCGRP_REFRELE(gcgrp);
1594 		} else if ((ire->ire_mp == NULL) &&
1595 		    tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) {
1596 			return (B_FALSE);
1597 		}
1598 	}
1599 
1600 	ire->ire_stq = stq;
1601 	ire->ire_rfq = rfq;
1602 	ire->ire_type = type;
1603 	ire->ire_flags = RTF_UP | flags;
1604 	ire->ire_ident = TICK_TO_MSEC(lbolt);
1605 	bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t));
1606 
1607 	ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
1608 	ire->ire_last_used_time = lbolt;
1609 	ire->ire_create_time = (uint32_t)gethrestime_sec();
1610 
1611 	/*
1612 	 * If this IRE is an IRE_CACHE, inherit the handles from the
1613 	 * parent IREs. For others in the forwarding table, assign appropriate
1614 	 * new ones.
1615 	 *
1616 	 * The mutex protecting ire_handle is because ire_create is not always
1617 	 * called as a writer.
1618 	 */
1619 	if (ire->ire_type & IRE_OFFSUBNET) {
1620 		mutex_enter(&ipst->ips_ire_handle_lock);
1621 		ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++;
1622 		mutex_exit(&ipst->ips_ire_handle_lock);
1623 	} else if (ire->ire_type & IRE_INTERFACE) {
1624 		mutex_enter(&ipst->ips_ire_handle_lock);
1625 		ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++;
1626 		mutex_exit(&ipst->ips_ire_handle_lock);
1627 	} else if (ire->ire_type == IRE_CACHE) {
1628 		ire->ire_phandle = phandle;
1629 		ire->ire_ihandle = ihandle;
1630 	}
1631 	ire->ire_ipif = ipif;
1632 	if (ipif != NULL) {
1633 		ire->ire_ipif_seqid = ipif->ipif_seqid;
1634 		ire->ire_ipif_ifindex =
1635 		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
1636 		ire->ire_zoneid = ipif->ipif_zoneid;
1637 	} else {
1638 		ire->ire_zoneid = GLOBAL_ZONEID;
1639 	}
1640 	ire->ire_ipversion = ipversion;
1641 	mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL);
1642 	if (ipversion == IPV4_VERSION) {
1643 		/*
1644 		 * IPv6 initializes the ire_nce in ire_add_v6, which expects
1645 		 * to find the ire_nce to be null when it is called.
1646 		 */
1647 		if (ire_nce_init(ire, src_nce) != 0) {
1648 			/* some failure occurred. propagate error back */
1649 			return (B_FALSE);
1650 		}
1651 	}
1652 	ire->ire_refcnt = 1;
1653 	ire->ire_ipst = ipst;	/* No netstack_hold */
1654 	ire->ire_trace_disable = B_FALSE;
1655 
1656 	return (B_TRUE);
1657 }
1658 
1659 /*
1660  * This routine is called repeatedly by ipif_up to create broadcast IREs.
1661  * It is passed a pointer to a slot in an IRE pointer array into which to
1662  * place the pointer to the new IRE, if indeed we create one.  If the
1663  * IRE corresponding to the address passed in would be a duplicate of an
1664  * existing one, we don't create the new one.  irep is incremented before
1665  * return only if we do create a new IRE.  (Always called as writer.)
1666  *
1667  * Note that with the "match_flags" parameter, we can match on either
1668  * a particular logical interface (MATCH_IRE_IPIF) or for all logical
1669  * interfaces for a given physical interface (MATCH_IRE_ILL).  Currently,
1670  * we only create broadcast ire's on a per physical interface basis. If
1671  * someone is going to be mucking with logical interfaces, it is important
1672  * to call "ipif_check_bcast_ires()" to make sure that any change to a
1673  * logical interface will not cause critical broadcast IRE's to be deleted.
1674  */
1675 ire_t **
1676 ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t  addr, ire_t **irep,
1677     int match_flags)
1678 {
1679 	ire_t *ire;
1680 	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
1681 	boolean_t prefer;
1682 	ill_t *ill = ipif->ipif_ill;
1683 	ip_stack_t *ipst = ill->ill_ipst;
1684 
1685 	/*
1686 	 * No broadcast IREs for the LOOPBACK interface
1687 	 * or others such as point to point and IPIF_NOXMIT.
1688 	 */
1689 	if (!(ipif->ipif_flags & IPIF_BROADCAST) ||
1690 	    (ipif->ipif_flags & IPIF_NOXMIT))
1691 		return (irep);
1692 
1693 	/*
1694 	 * If this new IRE would be a duplicate, only prefer it if one of
1695 	 * the following is true:
1696 	 *
1697 	 * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST
1698 	 *    set and the new one has all of those clear.
1699 	 *
1700 	 * 2. The existing one corresponds to an underlying ILL in an IPMP
1701 	 *    group and the new one corresponds to an IPMP group interface.
1702 	 */
1703 	if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif,
1704 	    ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) {
1705 		prefer = ((ire->ire_ipif->ipif_flags & check_flags) &&
1706 		    !(ipif->ipif_flags & check_flags)) ||
1707 		    (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill));
1708 		if (!prefer) {
1709 			ire_refrele(ire);
1710 			return (irep);
1711 		}
1712 
1713 		/*
1714 		 * Bcast ires exist in pairs. Both have to be deleted,
1715 		 * Since we are exclusive we can make the above assertion.
1716 		 * The 1st has to be refrele'd since it was ctable_lookup'd.
1717 		 */
1718 		ASSERT(IAM_WRITER_IPIF(ipif));
1719 		ASSERT(ire->ire_next->ire_addr == ire->ire_addr);
1720 		ire_delete(ire->ire_next);
1721 		ire_delete(ire);
1722 		ire_refrele(ire);
1723 	}
1724 	return (ire_create_bcast(ipif, addr, irep));
1725 }
1726 
1727 uint_t ip_loopback_mtu = IP_LOOPBACK_MTU;
1728 
1729 /*
1730  * This routine is called from ipif_check_bcast_ires and ire_check_bcast.
1731  * It leaves all the verifying and deleting to those routines. So it always
1732  * creates 2 bcast ires and chains them into the ire array passed in.
1733  */
1734 ire_t **
1735 ire_create_bcast(ipif_t *ipif, ipaddr_t  addr, ire_t **irep)
1736 {
1737 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
1738 	ill_t		*ill = ipif->ipif_ill;
1739 
1740 	ASSERT(IAM_WRITER_IPIF(ipif));
1741 
1742 	if (IS_IPMP(ill)) {
1743 		/*
1744 		 * Broadcast IREs for the IPMP meta-interface use the
1745 		 * nominated broadcast interface to send and receive packets.
1746 		 * If there's no nominated interface, send the packets down to
1747 		 * the IPMP stub driver, which will discard them.  If the
1748 		 * nominated broadcast interface changes, ill_refresh_bcast()
1749 		 * will refresh the broadcast IREs.
1750 		 */
1751 		if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
1752 			ill = ipif->ipif_ill;
1753 	}
1754 
1755 	*irep++ = ire_create(
1756 	    (uchar_t *)&addr,			/* dest addr */
1757 	    (uchar_t *)&ip_g_all_ones,		/* mask */
1758 	    (uchar_t *)&ipif->ipif_src_addr,	/* source addr */
1759 	    NULL,				/* no gateway */
1760 	    &ipif->ipif_mtu,			/* max frag */
1761 	    NULL,				/* no src nce */
1762 	    ill->ill_rq,			/* recv-from queue */
1763 	    ill->ill_wq,			/* send-to queue */
1764 	    IRE_BROADCAST,
1765 	    ipif,
1766 	    0,
1767 	    0,
1768 	    0,
1769 	    0,
1770 	    &ire_uinfo_null,
1771 	    NULL,
1772 	    NULL,
1773 	    ipst);
1774 
1775 	*irep++ = ire_create(
1776 	    (uchar_t *)&addr,			/* dest address */
1777 	    (uchar_t *)&ip_g_all_ones,		/* mask */
1778 	    (uchar_t *)&ipif->ipif_src_addr,	/* source address */
1779 	    NULL,				/* no gateway */
1780 	    &ip_loopback_mtu,			/* max frag size */
1781 	    NULL,				/* no src_nce */
1782 	    ill->ill_rq,			/* recv-from queue */
1783 	    NULL,				/* no send-to queue */
1784 	    IRE_BROADCAST,			/* Needed for fanout in wput */
1785 	    ipif,
1786 	    0,
1787 	    0,
1788 	    0,
1789 	    0,
1790 	    &ire_uinfo_null,
1791 	    NULL,
1792 	    NULL,
1793 	    ipst);
1794 
1795 	return (irep);
1796 }
1797 
1798 /*
1799  * ire_walk routine to delete or update any IRE_CACHE that might contain
1800  * stale information.
1801  * The flags state which entries to delete or update.
1802  * Garbage collection is done separately using kmem alloc callbacks to
1803  * ip_trash_ire_reclaim.
1804  * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME
1805  * since other stale information is cleaned up using NUD.
1806  */
1807 void
1808 ire_expire(ire_t *ire, char *arg)
1809 {
1810 	ire_expire_arg_t	*ieap = (ire_expire_arg_t *)(uintptr_t)arg;
1811 	ill_t			*stq_ill;
1812 	int			flush_flags = ieap->iea_flush_flag;
1813 	ip_stack_t		*ipst = ieap->iea_ipst;
1814 
1815 	if ((flush_flags & FLUSH_REDIRECT_TIME) &&
1816 	    (ire->ire_flags & RTF_DYNAMIC)) {
1817 		/* Make sure we delete the corresponding IRE_CACHE */
1818 		ip1dbg(("ire_expire: all redirects\n"));
1819 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
1820 		ire_delete(ire);
1821 		atomic_dec_32(&ipst->ips_ip_redirect_cnt);
1822 		return;
1823 	}
1824 	if (ire->ire_type != IRE_CACHE)
1825 		return;
1826 
1827 	if (flush_flags & FLUSH_ARP_TIME) {
1828 		/*
1829 		 * Remove all IRE_CACHE except IPv4 multicast ires. These
1830 		 * ires will be deleted by ip_trash_ire_reclaim_stack()
1831 		 * when system runs low in memory.
1832 		 * Verify that create time is more than ip_ire_arp_interval
1833 		 * milliseconds ago.
1834 		 */
1835 
1836 		if (!(ire->ire_ipversion == IPV4_VERSION &&
1837 		    CLASSD(ire->ire_addr)) && NCE_EXPIRED(ire->ire_nce, ipst)) {
1838 			ire_delete(ire);
1839 			return;
1840 		}
1841 	}
1842 
1843 	if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) &&
1844 	    (ire->ire_ipif != NULL)) {
1845 		/* Increase pmtu if it is less than the interface mtu */
1846 		mutex_enter(&ire->ire_lock);
1847 		/*
1848 		 * If the ipif is a vni (whose mtu is 0, since it's virtual)
1849 		 * get the mtu from the sending interfaces' ipif
1850 		 */
1851 		if (IS_VNI(ire->ire_ipif->ipif_ill)) {
1852 			stq_ill = ire->ire_stq->q_ptr;
1853 			ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu,
1854 			    IP_MAXPACKET);
1855 		} else {
1856 			ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu,
1857 			    IP_MAXPACKET);
1858 		}
1859 		ire->ire_frag_flag |= IPH_DF;
1860 		mutex_exit(&ire->ire_lock);
1861 	}
1862 }
1863 
1864 /*
1865  * Return any local address.  We use this to target ourselves
1866  * when the src address was specified as 'default'.
1867  * Preference for IRE_LOCAL entries.
1868  */
1869 ire_t *
1870 ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst)
1871 {
1872 	ire_t	*ire;
1873 	irb_t	*irb;
1874 	ire_t	*maybe = NULL;
1875 	int i;
1876 
1877 	for (i = 0; i < ipst->ips_ip_cache_table_size;  i++) {
1878 		irb = &ipst->ips_ip_cache_table[i];
1879 		if (irb->irb_ire == NULL)
1880 			continue;
1881 		rw_enter(&irb->irb_lock, RW_READER);
1882 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
1883 			if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
1884 			    (ire->ire_zoneid != zoneid &&
1885 			    ire->ire_zoneid != ALL_ZONES))
1886 				continue;
1887 			switch (ire->ire_type) {
1888 			case IRE_LOOPBACK:
1889 				if (maybe == NULL) {
1890 					IRE_REFHOLD(ire);
1891 					maybe = ire;
1892 				}
1893 				break;
1894 			case IRE_LOCAL:
1895 				if (maybe != NULL) {
1896 					ire_refrele(maybe);
1897 				}
1898 				IRE_REFHOLD(ire);
1899 				rw_exit(&irb->irb_lock);
1900 				return (ire);
1901 			}
1902 		}
1903 		rw_exit(&irb->irb_lock);
1904 	}
1905 	return (maybe);
1906 }
1907 
1908 /*
1909  * If the specified IRE is associated with a particular ILL, return
1910  * that ILL pointer (May be called as writer.).
1911  *
1912  * NOTE : This is not a generic function that can be used always.
1913  * This function always returns the ill of the outgoing packets
1914  * if this ire is used.
1915  */
1916 ill_t *
1917 ire_to_ill(const ire_t *ire)
1918 {
1919 	ill_t *ill = NULL;
1920 
1921 	/*
1922 	 * 1) For an IRE_CACHE, ire_ipif is the one where it obtained
1923 	 *    the source address from. ire_stq is the one where the
1924 	 *    packets will be sent out on. We return that here.
1925 	 *
1926 	 * 2) IRE_BROADCAST normally has a loopback and a non-loopback
1927 	 *    copy and they always exist next to each other with loopback
1928 	 *    copy being the first one. If we are called on the non-loopback
1929 	 *    copy, return the one pointed by ire_stq. If it was called on
1930 	 *    a loopback copy, we still return the one pointed by the next
1931 	 *    ire's ire_stq pointer i.e the one pointed by the non-loopback
1932 	 *    copy. We don't want use ire_ipif as it might represent the
1933 	 *    source address (if we borrow source addresses for
1934 	 *    IRE_BROADCASTS in the future).
1935 	 *    However if an interface is currently coming up, the above
1936 	 *    condition may not hold during that period since the ires
1937 	 *    are added one at a time. Thus one of the pair could have been
1938 	 *    added and the other not yet added.
1939 	 * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill.
1940 	 * 4) For all others return the ones pointed by ire_ipif->ipif_ill.
1941 	 *    That handles IRE_LOOPBACK.
1942 	 */
1943 
1944 	if (ire->ire_type == IRE_CACHE) {
1945 		ill = (ill_t *)ire->ire_stq->q_ptr;
1946 	} else if (ire->ire_type == IRE_BROADCAST) {
1947 		if (ire->ire_stq != NULL) {
1948 			ill = (ill_t *)ire->ire_stq->q_ptr;
1949 		} else {
1950 			ire_t  *ire_next;
1951 
1952 			ire_next = ire->ire_next;
1953 			if (ire_next != NULL &&
1954 			    ire_next->ire_type == IRE_BROADCAST &&
1955 			    ire_next->ire_addr == ire->ire_addr &&
1956 			    ire_next->ire_ipif == ire->ire_ipif) {
1957 				ill = (ill_t *)ire_next->ire_stq->q_ptr;
1958 			}
1959 		}
1960 	} else if (ire->ire_rfq != NULL) {
1961 		ill = ire->ire_rfq->q_ptr;
1962 	} else if (ire->ire_ipif != NULL) {
1963 		ill = ire->ire_ipif->ipif_ill;
1964 	}
1965 	return (ill);
1966 }
1967 
1968 /* Arrange to call the specified function for every IRE in the world. */
1969 void
1970 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst)
1971 {
1972 	ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst);
1973 }
1974 
1975 void
1976 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
1977 {
1978 	ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst);
1979 }
1980 
1981 void
1982 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
1983 {
1984 	ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst);
1985 }
1986 
1987 /*
1988  * Walk a particular version. version == 0 means both v4 and v6.
1989  */
1990 static void
1991 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid,
1992     ip_stack_t *ipst)
1993 {
1994 	if (vers != IPV6_VERSION) {
1995 		/*
1996 		 * ip_forwarding_table variable doesn't matter for IPv4 since
1997 		 * ire_walk_ill_tables uses ips_ip_ftable for IPv4.
1998 		 */
1999 		ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE,
2000 		    0, NULL,
2001 		    ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table,
2002 		    NULL, zoneid, ipst);
2003 	}
2004 	if (vers != IPV4_VERSION) {
2005 		ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE,
2006 		    ipst->ips_ip6_ftable_hash_size,
2007 		    ipst->ips_ip_forwarding_table_v6,
2008 		    ipst->ips_ip6_cache_table_size,
2009 		    ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst);
2010 	}
2011 }
2012 
2013 /*
2014  * Arrange to call the specified function for every IRE that matches the ill.
2015  */
2016 void
2017 ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
2018     ill_t *ill)
2019 {
2020 	uchar_t vers = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
2021 
2022 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill);
2023 }
2024 
2025 void
2026 ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
2027     ill_t *ill)
2028 {
2029 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION,
2030 	    ill);
2031 }
2032 
2033 void
2034 ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
2035     ill_t *ill)
2036 {
2037 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION,
2038 	    ill);
2039 }
2040 
2041 /*
2042  * Walk a particular ill and version.
2043  */
2044 static void
2045 ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func,
2046     void *arg, uchar_t vers, ill_t *ill)
2047 {
2048 	ip_stack_t	*ipst = ill->ill_ipst;
2049 
2050 	if (vers == IPV4_VERSION) {
2051 		ire_walk_ill_tables(match_flags, ire_type, func, arg,
2052 		    IP_MASK_TABLE_SIZE, 0,
2053 		    NULL, ipst->ips_ip_cache_table_size,
2054 		    ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst);
2055 	} else if (vers == IPV6_VERSION) {
2056 		ire_walk_ill_tables(match_flags, ire_type, func, arg,
2057 		    IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size,
2058 		    ipst->ips_ip_forwarding_table_v6,
2059 		    ipst->ips_ip6_cache_table_size,
2060 		    ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst);
2061 	}
2062 }
2063 
2064 boolean_t
2065 ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
2066     ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst)
2067 {
2068 	ill_t *ire_stq_ill = NULL;
2069 	ill_t *ire_ipif_ill = NULL;
2070 
2071 	ASSERT(match_flags != 0 || zoneid != ALL_ZONES);
2072 	/*
2073 	 * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and
2074 	 *    ire_ipif.  Only in the case of IRE_CACHEs can ire_stq and
2075 	 *    ire_ipif be pointing to different ills. But we want to keep
2076 	 *    this function generic enough for future use. So, we always
2077 	 *    try to match on both.  The only caller of this function
2078 	 *    ire_walk_ill_tables, will call "func" after we return from
2079 	 *    this function. We expect "func" to do the right filtering
2080 	 *    of ires in this case.
2081 	 */
2082 	if (match_flags & MATCH_IRE_ILL) {
2083 		if (ire->ire_stq != NULL)
2084 			ire_stq_ill = ire->ire_stq->q_ptr;
2085 		if (ire->ire_ipif != NULL)
2086 			ire_ipif_ill = ire->ire_ipif->ipif_ill;
2087 	}
2088 
2089 	if (zoneid != ALL_ZONES) {
2090 		/*
2091 		 * We're walking the IREs for a specific zone. The only relevant
2092 		 * IREs are:
2093 		 * - all IREs with a matching ire_zoneid
2094 		 * - all IRE_OFFSUBNETs as they're shared across all zones
2095 		 * - IRE_INTERFACE IREs for interfaces with a usable source addr
2096 		 *   with a matching zone
2097 		 * - IRE_DEFAULTs with a gateway reachable from the zone
2098 		 * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs
2099 		 * using the same rule; but the above rules are consistent with
2100 		 * the behavior of ire_ftable_lookup[_v6]() so that all the
2101 		 * routes that can be matched during lookup are also matched
2102 		 * here.
2103 		 */
2104 		if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) {
2105 			/*
2106 			 * Note, IRE_INTERFACE can have the stq as NULL. For
2107 			 * example, if the default multicast route is tied to
2108 			 * the loopback address.
2109 			 */
2110 			if ((ire->ire_type & IRE_INTERFACE) &&
2111 			    (ire->ire_stq != NULL)) {
2112 				ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
2113 				if (ire->ire_ipversion == IPV4_VERSION) {
2114 					if (!ipif_usesrc_avail(ire_stq_ill,
2115 					    zoneid))
2116 						/* No usable src addr in zone */
2117 						return (B_FALSE);
2118 				} else if (ire_stq_ill->ill_usesrc_ifindex
2119 				    != 0) {
2120 					/*
2121 					 * For IPv6 use ipif_select_source_v6()
2122 					 * so the right scope selection is done
2123 					 */
2124 					ipif_t *src_ipif;
2125 					src_ipif =
2126 					    ipif_select_source_v6(ire_stq_ill,
2127 					    &ire->ire_addr_v6, B_FALSE,
2128 					    IPV6_PREFER_SRC_DEFAULT,
2129 					    zoneid);
2130 					if (src_ipif != NULL) {
2131 						ipif_refrele(src_ipif);
2132 					} else {
2133 						return (B_FALSE);
2134 					}
2135 				} else {
2136 					return (B_FALSE);
2137 				}
2138 
2139 			} else if (!(ire->ire_type & IRE_OFFSUBNET)) {
2140 				return (B_FALSE);
2141 			}
2142 		}
2143 
2144 		/*
2145 		 * Match all default routes from the global zone, irrespective
2146 		 * of reachability. For a non-global zone only match those
2147 		 * where ire_gateway_addr has a IRE_INTERFACE for the zoneid.
2148 		 */
2149 		if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) {
2150 			int ire_match_flags = 0;
2151 			in6_addr_t gw_addr_v6;
2152 			ire_t *rire;
2153 
2154 			ire_match_flags |= MATCH_IRE_TYPE;
2155 			if (ire->ire_ipif != NULL)
2156 				ire_match_flags |= MATCH_IRE_ILL;
2157 
2158 			if (ire->ire_ipversion == IPV4_VERSION) {
2159 				rire = ire_route_lookup(ire->ire_gateway_addr,
2160 				    0, 0, IRE_INTERFACE, ire->ire_ipif, NULL,
2161 				    zoneid, NULL, ire_match_flags, ipst);
2162 			} else {
2163 				ASSERT(ire->ire_ipversion == IPV6_VERSION);
2164 				mutex_enter(&ire->ire_lock);
2165 				gw_addr_v6 = ire->ire_gateway_addr_v6;
2166 				mutex_exit(&ire->ire_lock);
2167 				rire = ire_route_lookup_v6(&gw_addr_v6,
2168 				    NULL, NULL, IRE_INTERFACE, ire->ire_ipif,
2169 				    NULL, zoneid, NULL, ire_match_flags, ipst);
2170 			}
2171 			if (rire == NULL) {
2172 				return (B_FALSE);
2173 			}
2174 			ire_refrele(rire);
2175 		}
2176 	}
2177 
2178 	if (((!(match_flags & MATCH_IRE_TYPE)) ||
2179 	    (ire->ire_type & ire_type)) &&
2180 	    ((!(match_flags & MATCH_IRE_ILL)) ||
2181 	    (ire_stq_ill == ill || ire_ipif_ill == ill ||
2182 	    ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) {
2183 		return (B_TRUE);
2184 	}
2185 	return (B_FALSE);
2186 }
2187 
2188 int
2189 rtfunc(struct radix_node *rn, void *arg)
2190 {
2191 	struct rtfuncarg *rtf = arg;
2192 	struct rt_entry *rt;
2193 	irb_t *irb;
2194 	ire_t *ire;
2195 	boolean_t ret;
2196 
2197 	rt = (struct rt_entry *)rn;
2198 	ASSERT(rt != NULL);
2199 	irb = &rt->rt_irb;
2200 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2201 		if ((rtf->rt_match_flags != 0) ||
2202 		    (rtf->rt_zoneid != ALL_ZONES)) {
2203 			ret = ire_walk_ill_match(rtf->rt_match_flags,
2204 			    rtf->rt_ire_type, ire,
2205 			    rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst);
2206 		} else
2207 			ret = B_TRUE;
2208 		if (ret)
2209 			(*rtf->rt_func)(ire, rtf->rt_arg);
2210 	}
2211 	return (0);
2212 }
2213 
2214 /*
2215  * Walk the ftable and the ctable entries that match the ill.
2216  */
2217 void
2218 ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
2219     void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl,
2220     size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid,
2221     ip_stack_t *ipst)
2222 {
2223 	irb_t	*irb_ptr;
2224 	irb_t	*irb;
2225 	ire_t	*ire;
2226 	int i, j;
2227 	boolean_t ret;
2228 	struct rtfuncarg rtfarg;
2229 
2230 	ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL));
2231 	ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));
2232 	/*
2233 	 * Optimize by not looking at the forwarding table if there
2234 	 * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE
2235 	 * specified in ire_type.
2236 	 */
2237 	if (!(match_flags & MATCH_IRE_TYPE) ||
2238 	    ((ire_type & IRE_FORWARDTABLE) != 0)) {
2239 		/* knobs such that routine is called only for v6 case */
2240 		if (ipftbl == ipst->ips_ip_forwarding_table_v6) {
2241 			for (i = (ftbl_sz - 1);  i >= 0; i--) {
2242 				if ((irb_ptr = ipftbl[i]) == NULL)
2243 					continue;
2244 				for (j = 0; j < htbl_sz; j++) {
2245 					irb = &irb_ptr[j];
2246 					if (irb->irb_ire == NULL)
2247 						continue;
2248 
2249 					IRB_REFHOLD(irb);
2250 					for (ire = irb->irb_ire; ire != NULL;
2251 					    ire = ire->ire_next) {
2252 						if (match_flags == 0 &&
2253 						    zoneid == ALL_ZONES) {
2254 							ret = B_TRUE;
2255 						} else {
2256 							ret =
2257 							    ire_walk_ill_match(
2258 							    match_flags,
2259 							    ire_type, ire, ill,
2260 							    zoneid, ipst);
2261 						}
2262 						if (ret)
2263 							(*func)(ire, arg);
2264 					}
2265 					IRB_REFRELE(irb);
2266 				}
2267 			}
2268 		} else {
2269 			(void) memset(&rtfarg, 0, sizeof (rtfarg));
2270 			rtfarg.rt_func = func;
2271 			rtfarg.rt_arg = arg;
2272 			if (match_flags != 0) {
2273 				rtfarg.rt_match_flags = match_flags;
2274 			}
2275 			rtfarg.rt_ire_type = ire_type;
2276 			rtfarg.rt_ill = ill;
2277 			rtfarg.rt_zoneid = zoneid;
2278 			rtfarg.rt_ipst = ipst;	/* No netstack_hold */
2279 			(void) ipst->ips_ip_ftable->rnh_walktree_mt(
2280 			    ipst->ips_ip_ftable,
2281 			    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
2282 		}
2283 	}
2284 
2285 	/*
2286 	 * Optimize by not looking at the cache table if there
2287 	 * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE
2288 	 * specified in ire_type.
2289 	 */
2290 	if (!(match_flags & MATCH_IRE_TYPE) ||
2291 	    ((ire_type & IRE_CACHETABLE) != 0)) {
2292 		for (i = 0; i < ctbl_sz;  i++) {
2293 			irb = &ipctbl[i];
2294 			if (irb->irb_ire == NULL)
2295 				continue;
2296 			IRB_REFHOLD(irb);
2297 			for (ire = irb->irb_ire; ire != NULL;
2298 			    ire = ire->ire_next) {
2299 				if (match_flags == 0 && zoneid == ALL_ZONES) {
2300 					ret = B_TRUE;
2301 				} else {
2302 					ret = ire_walk_ill_match(
2303 					    match_flags, ire_type,
2304 					    ire, ill, zoneid, ipst);
2305 				}
2306 				if (ret)
2307 					(*func)(ire, arg);
2308 			}
2309 			IRB_REFRELE(irb);
2310 		}
2311 	}
2312 }
2313 
2314 /*
2315  * This function takes a mask and returns
2316  * number of bits set in the mask. If no
2317  * bit is set it returns 0.
2318  * Assumes a contiguous mask.
2319  */
2320 int
2321 ip_mask_to_plen(ipaddr_t mask)
2322 {
2323 	return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1));
2324 }
2325 
2326 /*
2327  * Convert length for a mask to the mask.
2328  */
2329 ipaddr_t
2330 ip_plen_to_mask(uint_t masklen)
2331 {
2332 	return (htonl(IP_HOST_MASK << (IP_ABITS - masklen)));
2333 }
2334 
2335 void
2336 ire_atomic_end(irb_t *irb_ptr, ire_t *ire)
2337 {
2338 	ill_t *stq_ill, *ipif_ill;
2339 	ip_stack_t *ipst = ire->ire_ipst;
2340 
2341 	stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL;
2342 	ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL;
2343 	RELEASE_ILL_LOCKS(ipif_ill, stq_ill);
2344 	rw_exit(&irb_ptr->irb_lock);
2345 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
2346 }
2347 
2348 /*
2349  * ire_add_v[46] atomically make sure that the ipif or ill associated
2350  * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING
2351  * before adding the ire to the table. This ensures that we don't create
2352  * new IRE_CACHEs with stale values for parameters that are passed to
2353  * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer
2354  * to the ipif_mtu, and not the value. The actual value is derived from the
2355  * parent ire or ipif under the bucket lock.
2356  */
2357 int
2358 ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp,
2359     ipsq_func_t func)
2360 {
2361 	ill_t	*stq_ill;
2362 	ill_t	*ipif_ill;
2363 	int	error = 0;
2364 	ill_t	*ill = NULL;
2365 	ip_stack_t	*ipst = ire->ire_ipst;
2366 
2367 	stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL;
2368 	ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL;
2369 
2370 	ASSERT((q != NULL && mp != NULL && func != NULL) ||
2371 	    (q == NULL && mp == NULL && func == NULL));
2372 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
2373 	GRAB_CONN_LOCK(q);
2374 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
2375 	GRAB_ILL_LOCKS(ipif_ill, stq_ill);
2376 
2377 	/*
2378 	 * While the IRE is in the process of being added, a user may have
2379 	 * invoked the ifconfig usesrc option on the stq_ill to make it a
2380 	 * usesrc client ILL. Check for this possibility here, if it is true
2381 	 * then we fail adding the IRE_CACHE. Another check is to make sure
2382 	 * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc
2383 	 * group. The ill_g_usesrc_lock is released in ire_atomic_end
2384 	 */
2385 	if ((ire->ire_type & IRE_CACHE) &&
2386 	    (ire->ire_marks & IRE_MARK_USESRC_CHECK)) {
2387 		if (stq_ill->ill_usesrc_ifindex != 0) {
2388 			ASSERT(stq_ill->ill_usesrc_grp_next != NULL);
2389 			if ((ipif_ill->ill_phyint->phyint_ifindex !=
2390 			    stq_ill->ill_usesrc_ifindex) ||
2391 			    (ipif_ill->ill_usesrc_grp_next == NULL) ||
2392 			    (ipif_ill->ill_usesrc_ifindex != 0)) {
2393 				error = EINVAL;
2394 				goto done;
2395 			}
2396 		} else if (ipif_ill->ill_usesrc_grp_next != NULL) {
2397 			error = EINVAL;
2398 			goto done;
2399 		}
2400 	}
2401 
2402 	/*
2403 	 * Don't allow IRE's to be created on changing ill's.  Also, since
2404 	 * IPMP flags can be set on an ill without quiescing it, if we're not
2405 	 * a writer on stq_ill, check that the flags still allow IRE creation.
2406 	 */
2407 	if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) {
2408 		if (stq_ill->ill_state_flags & ILL_CHANGING) {
2409 			ill = stq_ill;
2410 			error = EAGAIN;
2411 		} else if (IS_UNDER_IPMP(stq_ill)) {
2412 			mutex_enter(&stq_ill->ill_phyint->phyint_lock);
2413 			if (!ipmp_ill_is_active(stq_ill) &&
2414 			    !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) {
2415 				error = EINVAL;
2416 			}
2417 			mutex_exit(&stq_ill->ill_phyint->phyint_lock);
2418 		}
2419 		if (error != 0)
2420 			goto done;
2421 	}
2422 
2423 	if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) &&
2424 	    (ipif_ill->ill_state_flags & ILL_CHANGING)) {
2425 		ill = ipif_ill;
2426 		error = EAGAIN;
2427 		goto done;
2428 	}
2429 
2430 	if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) &&
2431 	    (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) {
2432 		ill = ire->ire_ipif->ipif_ill;
2433 		ASSERT(ill != NULL);
2434 		error = EAGAIN;
2435 		goto done;
2436 	}
2437 
2438 done:
2439 	if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) {
2440 		ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
2441 		mutex_enter(&ipsq->ipsq_lock);
2442 		mutex_enter(&ipsq->ipsq_xop->ipx_lock);
2443 		ire_atomic_end(irb_ptr, ire);
2444 		ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
2445 		mutex_exit(&ipsq->ipsq_xop->ipx_lock);
2446 		mutex_exit(&ipsq->ipsq_lock);
2447 		error = EINPROGRESS;
2448 	} else if (error != 0) {
2449 		ire_atomic_end(irb_ptr, ire);
2450 	}
2451 
2452 	RELEASE_CONN_LOCK(q);
2453 	return (error);
2454 }
2455 
2456 /*
2457  * Add a fully initialized IRE to an appropriate table based on
2458  * ire_type.
2459  *
2460  * allow_unresolved == B_FALSE indicates a legacy code-path call
2461  * that has prohibited the addition of incomplete ire's. If this
2462  * parameter is set, and we find an nce that is in a state other
2463  * than ND_REACHABLE, we fail the add. Note that nce_state could be
2464  * something other than ND_REACHABLE if the nce had just expired and
2465  * the ire_create preceding the ire_add added a new ND_INITIAL nce.
2466  */
2467 int
2468 ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
2469     boolean_t allow_unresolved)
2470 {
2471 	ire_t	*ire1;
2472 	ill_t	*stq_ill = NULL;
2473 	ill_t	*ill;
2474 	ipif_t	*ipif = NULL;
2475 	ill_walk_context_t ctx;
2476 	ire_t	*ire = *irep;
2477 	int	error;
2478 	boolean_t ire_is_mblk = B_FALSE;
2479 	tsol_gcgrp_t *gcgrp = NULL;
2480 	tsol_gcgrp_addr_t ga;
2481 	ip_stack_t	*ipst = ire->ire_ipst;
2482 
2483 	/* get ready for the day when original ire is not created as mblk */
2484 	if (ire->ire_mp != NULL) {
2485 		ire_is_mblk = B_TRUE;
2486 		/* Copy the ire to a kmem_alloc'ed area */
2487 		ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
2488 		if (ire1 == NULL) {
2489 			ip1dbg(("ire_add: alloc failed\n"));
2490 			ire_delete(ire);
2491 			*irep = NULL;
2492 			return (ENOMEM);
2493 		}
2494 		ire->ire_marks &= ~IRE_MARK_UNCACHED;
2495 		*ire1 = *ire;
2496 		ire1->ire_mp = NULL;
2497 		ire1->ire_stq_ifindex = 0;
2498 		freeb(ire->ire_mp);
2499 		ire = ire1;
2500 	}
2501 	if (ire->ire_stq != NULL)
2502 		stq_ill = ire->ire_stq->q_ptr;
2503 
2504 	if (stq_ill != NULL && ire->ire_type == IRE_CACHE &&
2505 	    stq_ill->ill_net_type == IRE_IF_RESOLVER) {
2506 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2507 		ill = ILL_START_WALK_ALL(&ctx, ipst);
2508 		for (; ill != NULL; ill = ill_next(&ctx, ill)) {
2509 			mutex_enter(&ill->ill_lock);
2510 			if (ill->ill_state_flags & ILL_CONDEMNED) {
2511 				mutex_exit(&ill->ill_lock);
2512 				continue;
2513 			}
2514 			/*
2515 			 * We need to make sure that the ipif is a valid one
2516 			 * before adding the IRE_CACHE. This happens only
2517 			 * with IRE_CACHE when there is an external resolver.
2518 			 *
2519 			 * We can unplumb a logical interface while the
2520 			 * packet is waiting in ARP with the IRE. Then,
2521 			 * later on when we feed the IRE back, the ipif
2522 			 * has to be re-checked. This can't happen with
2523 			 * NDP currently, as we never queue the IRE with
2524 			 * the packet. We always try to recreate the IRE
2525 			 * when the resolution is completed. But, we do
2526 			 * it for IPv6 also here so that in future if
2527 			 * we have external resolvers, it will work without
2528 			 * any change.
2529 			 */
2530 			ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid);
2531 			if (ipif != NULL) {
2532 				ipif_refhold_locked(ipif);
2533 				mutex_exit(&ill->ill_lock);
2534 				break;
2535 			}
2536 			mutex_exit(&ill->ill_lock);
2537 		}
2538 		rw_exit(&ipst->ips_ill_g_lock);
2539 		if (ipif == NULL ||
2540 		    (ipif->ipif_isv6 &&
2541 		    !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) &&
2542 		    !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
2543 		    &ipif->ipif_v6src_addr)) ||
2544 		    (!ipif->ipif_isv6 &&
2545 		    ire->ire_src_addr != ipif->ipif_src_addr) ||
2546 		    ire->ire_zoneid != ipif->ipif_zoneid) {
2547 			if (ipif != NULL)
2548 				ipif_refrele(ipif);
2549 			ire->ire_ipif = NULL;
2550 			ire_delete(ire);
2551 			*irep = NULL;
2552 			return (EINVAL);
2553 		}
2554 
2555 		ASSERT(ill != NULL);
2556 
2557 		/*
2558 		 * Since we didn't attach label security attributes to the
2559 		 * ire for the resolver case, we need to add it now. (only
2560 		 * for v4 resolver and v6 xresolv case).
2561 		 */
2562 		if (is_system_labeled() && ire_is_mblk) {
2563 			if (ire->ire_ipversion == IPV4_VERSION) {
2564 				ga.ga_af = AF_INET;
2565 				IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr !=
2566 				    INADDR_ANY ? ire->ire_gateway_addr :
2567 				    ire->ire_addr, &ga.ga_addr);
2568 			} else {
2569 				ga.ga_af = AF_INET6;
2570 				ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED(
2571 				    &ire->ire_gateway_addr_v6) ?
2572 				    ire->ire_addr_v6 :
2573 				    ire->ire_gateway_addr_v6;
2574 			}
2575 			gcgrp = gcgrp_lookup(&ga, B_FALSE);
2576 			error = tsol_ire_init_gwattr(ire, ire->ire_ipversion,
2577 			    NULL, gcgrp);
2578 			if (error != 0) {
2579 				if (gcgrp != NULL) {
2580 					GCGRP_REFRELE(gcgrp);
2581 					gcgrp = NULL;
2582 				}
2583 				ipif_refrele(ipif);
2584 				ire->ire_ipif = NULL;
2585 				ire_delete(ire);
2586 				*irep = NULL;
2587 				return (error);
2588 			}
2589 		}
2590 	}
2591 
2592 	/*
2593 	 * In case ire was changed
2594 	 */
2595 	*irep = ire;
2596 	if (ire->ire_ipversion == IPV6_VERSION)
2597 		error = ire_add_v6(irep, q, mp, func);
2598 	else
2599 		error = ire_add_v4(irep, q, mp, func, allow_unresolved);
2600 	if (ipif != NULL)
2601 		ipif_refrele(ipif);
2602 	return (error);
2603 }
2604 
2605 /*
2606  * Add an initialized IRE to an appropriate table based on ire_type.
2607  *
2608  * The forward table contains IRE_PREFIX/IRE_HOST and
2609  * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
2610  *
2611  * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
2612  * and IRE_CACHE.
2613  *
2614  * NOTE : This function is called as writer though not required
2615  * by this function.
2616  */
2617 static int
2618 ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
2619     boolean_t allow_unresolved)
2620 {
2621 	ire_t	*ire1;
2622 	irb_t	*irb_ptr;
2623 	ire_t	**irep;
2624 	int	flags;
2625 	ire_t	*pire = NULL;
2626 	ill_t	*stq_ill;
2627 	ire_t	*ire = *ire_p;
2628 	int	error;
2629 	boolean_t need_refrele = B_FALSE;
2630 	nce_t	*nce;
2631 	ip_stack_t	*ipst = ire->ire_ipst;
2632 	uint_t	marks = 0;
2633 
2634 	/*
2635 	 * IREs with source addresses hosted on interfaces that are under IPMP
2636 	 * should be hidden so that applications don't accidentally end up
2637 	 * sending packets with test addresses as their source addresses, or
2638 	 * sending out interfaces that are e.g. IFF_INACTIVE.  Hide them here.
2639 	 */
2640 	if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill))
2641 		marks |= IRE_MARK_TESTHIDDEN;
2642 
2643 	if (ire->ire_ipif != NULL)
2644 		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
2645 	if (ire->ire_stq != NULL)
2646 		ASSERT(!MUTEX_HELD(
2647 		    &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock));
2648 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
2649 	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
2650 
2651 	/* Find the appropriate list head. */
2652 	switch (ire->ire_type) {
2653 	case IRE_HOST:
2654 		ire->ire_mask = IP_HOST_MASK;
2655 		ire->ire_masklen = IP_ABITS;
2656 		ire->ire_marks |= marks;
2657 		if ((ire->ire_flags & RTF_SETSRC) == 0)
2658 			ire->ire_src_addr = 0;
2659 		break;
2660 	case IRE_CACHE:
2661 		ire->ire_mask = IP_HOST_MASK;
2662 		ire->ire_masklen = IP_ABITS;
2663 		ire->ire_marks |= marks;
2664 		break;
2665 	case IRE_BROADCAST:
2666 	case IRE_LOCAL:
2667 	case IRE_LOOPBACK:
2668 		ire->ire_mask = IP_HOST_MASK;
2669 		ire->ire_masklen = IP_ABITS;
2670 		break;
2671 	case IRE_PREFIX:
2672 	case IRE_DEFAULT:
2673 		ire->ire_marks |= marks;
2674 		if ((ire->ire_flags & RTF_SETSRC) == 0)
2675 			ire->ire_src_addr = 0;
2676 		break;
2677 	case IRE_IF_RESOLVER:
2678 	case IRE_IF_NORESOLVER:
2679 		ire->ire_marks |= marks;
2680 		break;
2681 	default:
2682 		ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n",
2683 		    (void *)ire, ire->ire_type));
2684 		ire_delete(ire);
2685 		*ire_p = NULL;
2686 		return (EINVAL);
2687 	}
2688 
2689 	/* Make sure the address is properly masked. */
2690 	ire->ire_addr &= ire->ire_mask;
2691 
2692 	/*
2693 	 * ip_newroute/ip_newroute_multi are unable to prevent the deletion
2694 	 * of the interface route while adding an IRE_CACHE for an on-link
2695 	 * destination in the IRE_IF_RESOLVER case, since the ire has to
2696 	 * go to ARP and return. We can't do a REFHOLD on the
2697 	 * associated interface ire for fear of ARP freeing the message.
2698 	 * Here we look up the interface ire in the forwarding table and
2699 	 * make sure that the interface route has not been deleted.
2700 	 */
2701 	if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 &&
2702 	    ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) {
2703 
2704 		ASSERT(ire->ire_max_fragp == NULL);
2705 		if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) {
2706 			/*
2707 			 * The ihandle that we used in ip_newroute_multi
2708 			 * comes from the interface route corresponding
2709 			 * to ire_ipif. Lookup here to see if it exists
2710 			 * still.
2711 			 * If the ire has a source address assigned using
2712 			 * RTF_SETSRC, ire_ipif is the logical interface holding
2713 			 * this source address, so we can't use it to check for
2714 			 * the existence of the interface route. Instead we rely
2715 			 * on the brute force ihandle search in
2716 			 * ire_ihandle_lookup_onlink() below.
2717 			 */
2718 			pire = ipif_to_ire(ire->ire_ipif);
2719 			if (pire == NULL) {
2720 				ire_delete(ire);
2721 				*ire_p = NULL;
2722 				return (EINVAL);
2723 			} else if (pire->ire_ihandle != ire->ire_ihandle) {
2724 				ire_refrele(pire);
2725 				ire_delete(ire);
2726 				*ire_p = NULL;
2727 				return (EINVAL);
2728 			}
2729 		} else {
2730 			pire = ire_ihandle_lookup_onlink(ire);
2731 			if (pire == NULL) {
2732 				ire_delete(ire);
2733 				*ire_p = NULL;
2734 				return (EINVAL);
2735 			}
2736 		}
2737 		/* Prevent pire from getting deleted */
2738 		IRB_REFHOLD(pire->ire_bucket);
2739 		/* Has it been removed already ? */
2740 		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
2741 			IRB_REFRELE(pire->ire_bucket);
2742 			ire_refrele(pire);
2743 			ire_delete(ire);
2744 			*ire_p = NULL;
2745 			return (EINVAL);
2746 		}
2747 	} else {
2748 		ASSERT(ire->ire_max_fragp != NULL);
2749 	}
2750 	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
2751 
2752 	if (ire->ire_ipif != NULL) {
2753 		/*
2754 		 * We use MATCH_IRE_IPIF while adding IRE_CACHES only
2755 		 * for historic reasons and to maintain symmetry with
2756 		 * IPv6 code path. Historically this was used by
2757 		 * multicast code to create multiple IRE_CACHES on
2758 		 * a single ill with different ipifs. This was used
2759 		 * so that multicast packets leaving the node had the
2760 		 * right source address. This is no longer needed as
2761 		 * ip_wput initializes the address correctly.
2762 		 */
2763 		flags |= MATCH_IRE_IPIF;
2764 		/*
2765 		 * If we are creating a hidden IRE, make sure we search for
2766 		 * hidden IREs when searching for duplicates below.
2767 		 * Otherwise, we might find an IRE on some other interface
2768 		 * that's not marked hidden.
2769 		 */
2770 		if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
2771 			flags |= MATCH_IRE_MARK_TESTHIDDEN;
2772 	}
2773 	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
2774 		irb_ptr = ire_get_bucket(ire);
2775 		need_refrele = B_TRUE;
2776 		if (irb_ptr == NULL) {
2777 			/*
2778 			 * This assumes that the ire has not added
2779 			 * a reference to the ipif.
2780 			 */
2781 			ire->ire_ipif = NULL;
2782 			ire_delete(ire);
2783 			if (pire != NULL) {
2784 				IRB_REFRELE(pire->ire_bucket);
2785 				ire_refrele(pire);
2786 			}
2787 			*ire_p = NULL;
2788 			return (EINVAL);
2789 		}
2790 	} else {
2791 		irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH(
2792 		    ire->ire_addr, ipst->ips_ip_cache_table_size)]);
2793 	}
2794 
2795 	/*
2796 	 * Start the atomic add of the ire. Grab the ill locks,
2797 	 * ill_g_usesrc_lock and the bucket lock. Check for condemned
2798 	 *
2799 	 * If ipif or ill is changing ire_atomic_start() may queue the
2800 	 * request and return EINPROGRESS.
2801 	 * To avoid lock order problems, get the ndp4->ndp_g_lock.
2802 	 */
2803 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
2804 	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
2805 	if (error != 0) {
2806 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
2807 		/*
2808 		 * We don't know whether it is a valid ipif or not.
2809 		 * So, set it to NULL. This assumes that the ire has not added
2810 		 * a reference to the ipif.
2811 		 */
2812 		ire->ire_ipif = NULL;
2813 		ire_delete(ire);
2814 		if (pire != NULL) {
2815 			IRB_REFRELE(pire->ire_bucket);
2816 			ire_refrele(pire);
2817 		}
2818 		*ire_p = NULL;
2819 		if (need_refrele)
2820 			IRB_REFRELE(irb_ptr);
2821 		return (error);
2822 	}
2823 	/*
2824 	 * To avoid creating ires having stale values for the ire_max_frag
2825 	 * we get the latest value atomically here. For more details
2826 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
2827 	 * in ip_rput_dlpi_writer
2828 	 */
2829 	if (ire->ire_max_fragp == NULL) {
2830 		if (CLASSD(ire->ire_addr))
2831 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
2832 		else
2833 			ire->ire_max_frag = pire->ire_max_frag;
2834 	} else {
2835 		uint_t	max_frag;
2836 
2837 		max_frag = *ire->ire_max_fragp;
2838 		ire->ire_max_fragp = NULL;
2839 		ire->ire_max_frag = max_frag;
2840 	}
2841 	/*
2842 	 * Atomically check for duplicate and insert in the table.
2843 	 */
2844 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2845 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
2846 			continue;
2847 		if (ire->ire_ipif != NULL) {
2848 			/*
2849 			 * We do MATCH_IRE_ILL implicitly here for IREs
2850 			 * with a non-null ire_ipif, including IRE_CACHEs.
2851 			 * As ire_ipif and ire_stq could point to two
2852 			 * different ills, we can't pass just ire_ipif to
2853 			 * ire_match_args and get a match on both ills.
2854 			 * This is just needed for duplicate checks here and
2855 			 * so we don't add an extra argument to
2856 			 * ire_match_args for this. Do it locally.
2857 			 *
2858 			 * NOTE : Currently there is no part of the code
2859 			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
2860 			 * match for IRE_CACHEs. Thus we don't want to
2861 			 * extend the arguments to ire_match_args.
2862 			 */
2863 			if (ire1->ire_stq != ire->ire_stq)
2864 				continue;
2865 			/*
2866 			 * Multiroute IRE_CACHEs for a given destination can
2867 			 * have the same ire_ipif, typically if their source
2868 			 * address is forced using RTF_SETSRC, and the same
2869 			 * send-to queue. We differentiate them using the parent
2870 			 * handle.
2871 			 */
2872 			if (ire->ire_type == IRE_CACHE &&
2873 			    (ire1->ire_flags & RTF_MULTIRT) &&
2874 			    (ire->ire_flags & RTF_MULTIRT) &&
2875 			    (ire1->ire_phandle != ire->ire_phandle))
2876 				continue;
2877 		}
2878 		if (ire1->ire_zoneid != ire->ire_zoneid)
2879 			continue;
2880 		if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
2881 		    ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif,
2882 		    ire->ire_zoneid, 0, NULL, flags, NULL)) {
2883 			/*
2884 			 * Return the old ire after doing a REFHOLD.
2885 			 * As most of the callers continue to use the IRE
2886 			 * after adding, we return a held ire. This will
2887 			 * avoid a lookup in the caller again. If the callers
2888 			 * don't want to use it, they need to do a REFRELE.
2889 			 */
2890 			ip1dbg(("found dup ire existing %p new %p\n",
2891 			    (void *)ire1, (void *)ire));
2892 			IRE_REFHOLD(ire1);
2893 			ire_atomic_end(irb_ptr, ire);
2894 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
2895 			ire_delete(ire);
2896 			if (pire != NULL) {
2897 				/*
2898 				 * Assert that it is not removed from the
2899 				 * list yet.
2900 				 */
2901 				ASSERT(pire->ire_ptpn != NULL);
2902 				IRB_REFRELE(pire->ire_bucket);
2903 				ire_refrele(pire);
2904 			}
2905 			*ire_p = ire1;
2906 			if (need_refrele)
2907 				IRB_REFRELE(irb_ptr);
2908 			return (0);
2909 		}
2910 	}
2911 
2912 	if (ire->ire_type & IRE_CACHE) {
2913 		ASSERT(ire->ire_stq != NULL);
2914 		nce = ndp_lookup_v4(ire_to_ill(ire),
2915 		    ((ire->ire_gateway_addr != INADDR_ANY) ?
2916 		    &ire->ire_gateway_addr : &ire->ire_addr),
2917 		    B_TRUE);
2918 		if (nce != NULL)
2919 			mutex_enter(&nce->nce_lock);
2920 		/*
2921 		 * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE
2922 		 * and the caller has prohibited the addition of incomplete
2923 		 * ire's, we fail the add. Note that nce_state could be
2924 		 * something other than ND_REACHABLE if the nce had
2925 		 * just expired and the ire_create preceding the
2926 		 * ire_add added a new ND_INITIAL nce.
2927 		 */
2928 		if ((nce == NULL) ||
2929 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
2930 		    (!allow_unresolved &&
2931 		    (nce->nce_state != ND_REACHABLE))) {
2932 			if (nce != NULL) {
2933 				DTRACE_PROBE1(ire__bad__nce, nce_t *, nce);
2934 				mutex_exit(&nce->nce_lock);
2935 			}
2936 			ire_atomic_end(irb_ptr, ire);
2937 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
2938 			if (nce != NULL)
2939 				NCE_REFRELE(nce);
2940 			DTRACE_PROBE1(ire__no__nce, ire_t *, ire);
2941 			ire_delete(ire);
2942 			if (pire != NULL) {
2943 				IRB_REFRELE(pire->ire_bucket);
2944 				ire_refrele(pire);
2945 			}
2946 			*ire_p = NULL;
2947 			if (need_refrele)
2948 				IRB_REFRELE(irb_ptr);
2949 			return (EINVAL);
2950 		} else {
2951 			ire->ire_nce = nce;
2952 			mutex_exit(&nce->nce_lock);
2953 			/*
2954 			 * We are associating this nce to the ire, so
2955 			 * change the nce ref taken in ndp_lookup_v4() from
2956 			 * NCE_REFHOLD to NCE_REFHOLD_NOTR
2957 			 */
2958 			NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
2959 		}
2960 	}
2961 	/*
2962 	 * Make it easy for ip_wput_ire() to hit multiple broadcast ires by
2963 	 * grouping identical addresses together on the hash chain.  We do
2964 	 * this only for IRE_BROADCASTs as ip_wput_ire is currently interested
2965 	 * in such groupings only for broadcasts.
2966 	 *
2967 	 * Find the first entry that matches ire_addr. *irep will be null
2968 	 * if no match.
2969 	 *
2970 	 * Note: the loopback and non-loopback broadcast entries for an
2971 	 * interface MUST be added before any MULTIRT entries.
2972 	 */
2973 	irep = (ire_t **)irb_ptr;
2974 	while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr)
2975 		irep = &ire1->ire_next;
2976 	if (ire->ire_type == IRE_BROADCAST && *irep != NULL) {
2977 		/*
2978 		 * We found some ire (i.e *irep) with a matching addr. We
2979 		 * want to group ires with same addr.
2980 		 */
2981 		for (;;) {
2982 			ire1 = *irep;
2983 			if ((ire1->ire_next == NULL) ||
2984 			    (ire1->ire_next->ire_addr != ire->ire_addr) ||
2985 			    (ire1->ire_type != IRE_BROADCAST) ||
2986 			    (ire1->ire_flags & RTF_MULTIRT) ||
2987 			    (ire1->ire_ipif->ipif_ill->ill_grp ==
2988 			    ire->ire_ipif->ipif_ill->ill_grp))
2989 				break;
2990 			irep = &ire1->ire_next;
2991 		}
2992 		ASSERT(*irep != NULL);
2993 		/*
2994 		 * The ire will be added before *irep, so
2995 		 * if irep is a MULTIRT ire, just break to
2996 		 * ire insertion code.
2997 		 */
2998 		if (((*irep)->ire_flags & RTF_MULTIRT) != 0)
2999 			goto insert_ire;
3000 
3001 		irep = &((*irep)->ire_next);
3002 
3003 		/*
3004 		 * Either we have hit the end of the list or the address
3005 		 * did not match.
3006 		 */
3007 		while (*irep != NULL) {
3008 			ire1 = *irep;
3009 			if ((ire1->ire_addr != ire->ire_addr) ||
3010 			    (ire1->ire_type != IRE_BROADCAST))
3011 				break;
3012 			if (ire1->ire_ipif == ire->ire_ipif) {
3013 				irep = &ire1->ire_next;
3014 				break;
3015 			}
3016 			irep = &ire1->ire_next;
3017 		}
3018 	} else if (*irep != NULL) {
3019 		/*
3020 		 * Find the last ire which matches ire_addr.
3021 		 * Needed to do tail insertion among entries with the same
3022 		 * ire_addr.
3023 		 */
3024 		while (ire->ire_addr == ire1->ire_addr) {
3025 			irep = &ire1->ire_next;
3026 			ire1 = *irep;
3027 			if (ire1 == NULL)
3028 				break;
3029 		}
3030 	}
3031 
3032 insert_ire:
3033 	/* Insert at *irep */
3034 	ire1 = *irep;
3035 	if (ire1 != NULL)
3036 		ire1->ire_ptpn = &ire->ire_next;
3037 	ire->ire_next = ire1;
3038 	/* Link the new one in. */
3039 	ire->ire_ptpn = irep;
3040 
3041 	/*
3042 	 * ire_walk routines de-reference ire_next without holding
3043 	 * a lock. Before we point to the new ire, we want to make
3044 	 * sure the store that sets the ire_next of the new ire
3045 	 * reaches global visibility, so that ire_walk routines
3046 	 * don't see a truncated list of ires i.e if the ire_next
3047 	 * of the new ire gets set after we do "*irep = ire" due
3048 	 * to re-ordering, the ire_walk thread will see a NULL
3049 	 * once it accesses the ire_next of the new ire.
3050 	 * membar_producer() makes sure that the following store
3051 	 * happens *after* all of the above stores.
3052 	 */
3053 	membar_producer();
3054 	*irep = ire;
3055 	ire->ire_bucket = irb_ptr;
3056 	/*
3057 	 * We return a bumped up IRE above. Keep it symmetrical
3058 	 * so that the callers will always have to release. This
3059 	 * helps the callers of this function because they continue
3060 	 * to use the IRE after adding and hence they don't have to
3061 	 * lookup again after we return the IRE.
3062 	 *
3063 	 * NOTE : We don't have to use atomics as this is appearing
3064 	 * in the list for the first time and no one else can bump
3065 	 * up the reference count on this yet.
3066 	 */
3067 	IRE_REFHOLD_LOCKED(ire);
3068 	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
3069 
3070 	irb_ptr->irb_ire_cnt++;
3071 	if (irb_ptr->irb_marks & IRB_MARK_FTABLE)
3072 		irb_ptr->irb_nire++;
3073 
3074 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
3075 		irb_ptr->irb_tmp_ire_cnt++;
3076 
3077 	if (ire->ire_ipif != NULL) {
3078 		DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
3079 		    (char *), "ire", (void *), ire);
3080 		ire->ire_ipif->ipif_ire_cnt++;
3081 		if (ire->ire_stq != NULL) {
3082 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
3083 			DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
3084 			    (char *), "ire", (void *), ire);
3085 			stq_ill->ill_ire_cnt++;
3086 		}
3087 	} else {
3088 		ASSERT(ire->ire_stq == NULL);
3089 	}
3090 
3091 	ire_atomic_end(irb_ptr, ire);
3092 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3093 
3094 	if (pire != NULL) {
3095 		/* Assert that it is not removed from the list yet */
3096 		ASSERT(pire->ire_ptpn != NULL);
3097 		IRB_REFRELE(pire->ire_bucket);
3098 		ire_refrele(pire);
3099 	}
3100 
3101 	if (ire->ire_type != IRE_CACHE) {
3102 		/*
3103 		 * For ire's with host mask see if there is an entry
3104 		 * in the cache. If there is one flush the whole cache as
3105 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
3106 		 * If no entry is found then there is no need to flush the
3107 		 * cache.
3108 		 */
3109 		if (ire->ire_mask == IP_HOST_MASK) {
3110 			ire_t *lire;
3111 			lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE,
3112 			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
3113 			if (lire != NULL) {
3114 				ire_refrele(lire);
3115 				ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
3116 			}
3117 		} else {
3118 			ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
3119 		}
3120 	}
3121 	/*
3122 	 * We had to delay the fast path probe until the ire is inserted
3123 	 * in the list. Otherwise the fast path ack won't find the ire in
3124 	 * the table.
3125 	 */
3126 	if (ire->ire_type == IRE_CACHE ||
3127 	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) {
3128 		ASSERT(ire->ire_nce != NULL);
3129 		if (ire->ire_nce->nce_state == ND_REACHABLE)
3130 			nce_fastpath(ire->ire_nce);
3131 	}
3132 	if (ire->ire_ipif != NULL)
3133 		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
3134 	*ire_p = ire;
3135 	if (need_refrele) {
3136 		IRB_REFRELE(irb_ptr);
3137 	}
3138 	return (0);
3139 }
3140 
3141 /*
3142  * IRB_REFRELE is the only caller of the function. ire_unlink calls to
3143  * do the final cleanup for this ire.
3144  */
3145 void
3146 ire_cleanup(ire_t *ire)
3147 {
3148 	ire_t *ire_next;
3149 	ip_stack_t *ipst = ire->ire_ipst;
3150 
3151 	ASSERT(ire != NULL);
3152 
3153 	while (ire != NULL) {
3154 		ire_next = ire->ire_next;
3155 		if (ire->ire_ipversion == IPV4_VERSION) {
3156 			ire_delete_v4(ire);
3157 			BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
3158 			    ire_stats_deleted);
3159 		} else {
3160 			ASSERT(ire->ire_ipversion == IPV6_VERSION);
3161 			ire_delete_v6(ire);
3162 			BUMP_IRE_STATS(ipst->ips_ire_stats_v6,
3163 			    ire_stats_deleted);
3164 		}
3165 		/*
3166 		 * Now it's really out of the list. Before doing the
3167 		 * REFRELE, set ire_next to NULL as ire_inactive asserts
3168 		 * so.
3169 		 */
3170 		ire->ire_next = NULL;
3171 		IRE_REFRELE_NOTR(ire);
3172 		ire = ire_next;
3173 	}
3174 }
3175 
3176 /*
3177  * IRB_REFRELE is the only caller of the function. It calls to unlink
3178  * all the CONDEMNED ires from this bucket.
3179  */
3180 ire_t *
3181 ire_unlink(irb_t *irb)
3182 {
3183 	ire_t *ire;
3184 	ire_t *ire1;
3185 	ire_t **ptpn;
3186 	ire_t *ire_list = NULL;
3187 
3188 	ASSERT(RW_WRITE_HELD(&irb->irb_lock));
3189 	ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) ||
3190 	    (irb->irb_refcnt == 0));
3191 	ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
3192 	ASSERT(irb->irb_ire != NULL);
3193 
3194 	for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
3195 		ip_stack_t	*ipst = ire->ire_ipst;
3196 
3197 		ire1 = ire->ire_next;
3198 		if (ire->ire_marks & IRE_MARK_CONDEMNED) {
3199 			ptpn = ire->ire_ptpn;
3200 			ire1 = ire->ire_next;
3201 			if (ire1)
3202 				ire1->ire_ptpn = ptpn;
3203 			*ptpn = ire1;
3204 			ire->ire_ptpn = NULL;
3205 			ire->ire_next = NULL;
3206 			if (ire->ire_type == IRE_DEFAULT) {
3207 				/*
3208 				 * IRE is out of the list. We need to adjust
3209 				 * the accounting before the caller drops
3210 				 * the lock.
3211 				 */
3212 				if (ire->ire_ipversion == IPV6_VERSION) {
3213 					ASSERT(ipst->
3214 					    ips_ipv6_ire_default_count !=
3215 					    0);
3216 					ipst->ips_ipv6_ire_default_count--;
3217 				}
3218 			}
3219 			/*
3220 			 * We need to call ire_delete_v4 or ire_delete_v6
3221 			 * to clean up the cache or the redirects pointing at
3222 			 * the default gateway. We need to drop the lock
3223 			 * as ire_flush_cache/ire_delete_host_redircts require
3224 			 * so. But we can't drop the lock, as ire_unlink needs
3225 			 * to atomically remove the ires from the list.
3226 			 * So, create a temporary list of CONDEMNED ires
3227 			 * for doing ire_delete_v4/ire_delete_v6 operations
3228 			 * later on.
3229 			 */
3230 			ire->ire_next = ire_list;
3231 			ire_list = ire;
3232 		}
3233 	}
3234 	irb->irb_marks &= ~IRB_MARK_CONDEMNED;
3235 	return (ire_list);
3236 }
3237 
3238 /*
3239  * Delete all the cache entries with this 'addr'.  When IP gets a gratuitous
3240  * ARP message on any of its interface queue, it scans the nce table and
3241  * deletes and calls ndp_delete() for the appropriate nce. This action
3242  * also deletes all the neighbor/ire cache entries for that address.
3243  * This function is called from ip_arp_news in ip.c and also for
3244  * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns
3245  * true if it finds a nce entry which is used by ip_arp_news to determine if
3246  * it needs to do an ire_walk_v4. The return value is also  used for the
3247  * same purpose by ARP IOCTL processing * in ip_if.c when deleting
3248  * ARP entries. For SIOC*IFARP ioctls in addition to the address,
3249  * ip_if->ipif_ill also needs to be matched.
3250  */
3251 boolean_t
3252 ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst)
3253 {
3254 	ill_t	*ill;
3255 	nce_t	*nce;
3256 
3257 	ill = (ipif ? ipif->ipif_ill : NULL);
3258 
3259 	if (ill != NULL) {
3260 		/*
3261 		 * clean up the nce (and any relevant ire's) that matches
3262 		 * on addr and ill.
3263 		 */
3264 		nce = ndp_lookup_v4(ill, &addr, B_FALSE);
3265 		if (nce != NULL) {
3266 			ndp_delete(nce);
3267 			return (B_TRUE);
3268 		}
3269 	} else {
3270 		/*
3271 		 * ill is wildcard. clean up all nce's and
3272 		 * ire's that match on addr
3273 		 */
3274 		nce_clookup_t cl;
3275 
3276 		cl.ncecl_addr = addr;
3277 		cl.ncecl_found = B_FALSE;
3278 
3279 		ndp_walk_common(ipst->ips_ndp4, NULL,
3280 		    (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE);
3281 
3282 		/*
3283 		 *  ncecl_found would be set by ip_nce_clookup_and_delete if
3284 		 *  we found a matching nce.
3285 		 */
3286 		return (cl.ncecl_found);
3287 	}
3288 	return (B_FALSE);
3289 
3290 }
3291 
3292 /* Delete the supplied nce if its nce_addr matches the supplied address */
3293 static void
3294 ip_nce_clookup_and_delete(nce_t *nce, void *arg)
3295 {
3296 	nce_clookup_t *cl = (nce_clookup_t *)arg;
3297 	ipaddr_t nce_addr;
3298 
3299 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3300 	if (nce_addr == cl->ncecl_addr) {
3301 		cl->ncecl_found = B_TRUE;
3302 		/* clean up the nce (and any relevant ire's) */
3303 		ndp_delete(nce);
3304 	}
3305 }
3306 
3307 /*
3308  * Clean up the radix node for this ire. Must be called by IRB_REFRELE
3309  * when there are no ire's left in the bucket. Returns TRUE if the bucket
3310  * is deleted and freed.
3311  */
3312 boolean_t
3313 irb_inactive(irb_t *irb)
3314 {
3315 	struct rt_entry *rt;
3316 	struct radix_node *rn;
3317 	ip_stack_t *ipst = irb->irb_ipst;
3318 
3319 	ASSERT(irb->irb_ipst != NULL);
3320 
3321 	rt = IRB2RT(irb);
3322 	rn = (struct radix_node *)rt;
3323 
3324 	/* first remove it from the radix tree. */
3325 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
3326 	rw_enter(&irb->irb_lock, RW_WRITER);
3327 	if (irb->irb_refcnt == 1 && irb->irb_nire == 0) {
3328 		rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask,
3329 		    ipst->ips_ip_ftable);
3330 		DTRACE_PROBE1(irb__free, rt_t *,  rt);
3331 		ASSERT((void *)rn == (void *)rt);
3332 		Free(rt, rt_entry_cache);
3333 		/* irb_lock is freed */
3334 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
3335 		return (B_TRUE);
3336 	}
3337 	rw_exit(&irb->irb_lock);
3338 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
3339 	return (B_FALSE);
3340 }
3341 
3342 /*
3343  * Delete the specified IRE.
3344  */
3345 void
3346 ire_delete(ire_t *ire)
3347 {
3348 	ire_t	*ire1;
3349 	ire_t	**ptpn;
3350 	irb_t *irb;
3351 	ip_stack_t	*ipst = ire->ire_ipst;
3352 
3353 	if ((irb = ire->ire_bucket) == NULL) {
3354 		/*
3355 		 * It was never inserted in the list. Should call REFRELE
3356 		 * to free this IRE.
3357 		 */
3358 		IRE_REFRELE_NOTR(ire);
3359 		return;
3360 	}
3361 
3362 	rw_enter(&irb->irb_lock, RW_WRITER);
3363 
3364 	if (irb->irb_rr_origin == ire) {
3365 		irb->irb_rr_origin = NULL;
3366 	}
3367 
3368 	/*
3369 	 * In case of V4 we might still be waiting for fastpath ack.
3370 	 */
3371 	if (ire->ire_ipversion == IPV4_VERSION &&
3372 	    (ire->ire_type == IRE_CACHE ||
3373 	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) {
3374 		ASSERT(ire->ire_nce != NULL);
3375 		nce_fastpath_list_delete(ire->ire_nce);
3376 	}
3377 
3378 	if (ire->ire_ptpn == NULL) {
3379 		/*
3380 		 * Some other thread has removed us from the list.
3381 		 * It should have done the REFRELE for us.
3382 		 */
3383 		rw_exit(&irb->irb_lock);
3384 		return;
3385 	}
3386 
3387 	if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
3388 		irb->irb_ire_cnt--;
3389 		ire->ire_marks |= IRE_MARK_CONDEMNED;
3390 		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
3391 			irb->irb_tmp_ire_cnt--;
3392 			ire->ire_marks &= ~IRE_MARK_TEMPORARY;
3393 		}
3394 	}
3395 
3396 	if (irb->irb_refcnt != 0) {
3397 		/*
3398 		 * The last thread to leave this bucket will
3399 		 * delete this ire.
3400 		 */
3401 		irb->irb_marks |= IRB_MARK_CONDEMNED;
3402 		rw_exit(&irb->irb_lock);
3403 		return;
3404 	}
3405 
3406 	/*
3407 	 * Normally to delete an ire, we walk the bucket. While we
3408 	 * walk the bucket, we normally bump up irb_refcnt and hence
3409 	 * we return from above where we mark CONDEMNED and the ire
3410 	 * gets deleted from ire_unlink. This case is where somebody
3411 	 * knows the ire e.g by doing a lookup, and wants to delete the
3412 	 * IRE. irb_refcnt would be 0 in this case if nobody is walking
3413 	 * the bucket.
3414 	 */
3415 	ptpn = ire->ire_ptpn;
3416 	ire1 = ire->ire_next;
3417 	if (ire1 != NULL)
3418 		ire1->ire_ptpn = ptpn;
3419 	ASSERT(ptpn != NULL);
3420 	*ptpn = ire1;
3421 	ire->ire_ptpn = NULL;
3422 	ire->ire_next = NULL;
3423 	if (ire->ire_ipversion == IPV6_VERSION) {
3424 		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted);
3425 	} else {
3426 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
3427 	}
3428 	/*
3429 	 * ip_wput/ip_wput_v6 checks this flag to see whether
3430 	 * it should still use the cached ire or not.
3431 	 */
3432 	if (ire->ire_type == IRE_DEFAULT) {
3433 		/*
3434 		 * IRE is out of the list. We need to adjust the
3435 		 * accounting before we drop the lock.
3436 		 */
3437 		if (ire->ire_ipversion == IPV6_VERSION) {
3438 			ASSERT(ipst->ips_ipv6_ire_default_count != 0);
3439 			ipst->ips_ipv6_ire_default_count--;
3440 		}
3441 	}
3442 	rw_exit(&irb->irb_lock);
3443 
3444 	if (ire->ire_ipversion == IPV6_VERSION) {
3445 		ire_delete_v6(ire);
3446 	} else {
3447 		ire_delete_v4(ire);
3448 	}
3449 	/*
3450 	 * We removed it from the list. Decrement the
3451 	 * reference count.
3452 	 */
3453 	IRE_REFRELE_NOTR(ire);
3454 }
3455 
3456 /*
3457  * Delete the specified IRE.
3458  * All calls should use ire_delete().
3459  * Sometimes called as writer though not required by this function.
3460  *
3461  * NOTE : This function is called only if the ire was added
3462  * in the list.
3463  */
3464 static void
3465 ire_delete_v4(ire_t *ire)
3466 {
3467 	ip_stack_t	*ipst = ire->ire_ipst;
3468 
3469 	ASSERT(ire->ire_refcnt >= 1);
3470 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
3471 
3472 	if (ire->ire_type != IRE_CACHE)
3473 		ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
3474 	if (ire->ire_type == IRE_DEFAULT) {
3475 		/*
3476 		 * when a default gateway is going away
3477 		 * delete all the host redirects pointing at that
3478 		 * gateway.
3479 		 */
3480 		ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
3481 	}
3482 }
3483 
3484 /*
3485  * IRE_REFRELE/ire_refrele are the only caller of the function. It calls
3486  * to free the ire when the reference count goes to zero.
3487  */
3488 void
3489 ire_inactive(ire_t *ire)
3490 {
3491 	nce_t	*nce;
3492 	ill_t	*ill = NULL;
3493 	ill_t	*stq_ill = NULL;
3494 	ipif_t	*ipif;
3495 	boolean_t	need_wakeup = B_FALSE;
3496 	irb_t 	*irb;
3497 	ip_stack_t	*ipst = ire->ire_ipst;
3498 
3499 	ASSERT(ire->ire_refcnt == 0);
3500 	ASSERT(ire->ire_ptpn == NULL);
3501 	ASSERT(ire->ire_next == NULL);
3502 
3503 	if (ire->ire_gw_secattr != NULL) {
3504 		ire_gw_secattr_free(ire->ire_gw_secattr);
3505 		ire->ire_gw_secattr = NULL;
3506 	}
3507 
3508 	if (ire->ire_mp != NULL) {
3509 		ASSERT(ire->ire_bucket == NULL);
3510 		mutex_destroy(&ire->ire_lock);
3511 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
3512 		if (ire->ire_nce != NULL)
3513 			NCE_REFRELE_NOTR(ire->ire_nce);
3514 		freeb(ire->ire_mp);
3515 		return;
3516 	}
3517 
3518 	if ((nce = ire->ire_nce) != NULL) {
3519 		NCE_REFRELE_NOTR(nce);
3520 		ire->ire_nce = NULL;
3521 	}
3522 
3523 	if (ire->ire_ipif == NULL)
3524 		goto end;
3525 
3526 	ipif = ire->ire_ipif;
3527 	ill = ipif->ipif_ill;
3528 
3529 	if (ire->ire_bucket == NULL) {
3530 		/* The ire was never inserted in the table. */
3531 		goto end;
3532 	}
3533 
3534 	/*
3535 	 * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is
3536 	 * non-null ill_ire_count also goes down by 1.
3537 	 *
3538 	 * The ipif that is associated with an ire is ire->ire_ipif and
3539 	 * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call
3540 	 * ipif_ill_refrele_tail. Usually stq_ill is null or the same as
3541 	 * ire->ire_ipif->ipif_ill. So nothing more needs to be done.
3542 	 * However, for VNI or IPMP IRE entries, stq_ill can be different.
3543 	 * If this is different from ire->ire_ipif->ipif_ill and if the
3544 	 * ill_ire_cnt on the stq_ill also has dropped to zero, we call
3545 	 * ipif_ill_refrele_tail on the stq_ill.
3546 	 */
3547 	if (ire->ire_stq != NULL)
3548 		stq_ill = ire->ire_stq->q_ptr;
3549 
3550 	if (stq_ill == NULL || stq_ill == ill) {
3551 		/* Optimize the most common case */
3552 		mutex_enter(&ill->ill_lock);
3553 		ASSERT(ipif->ipif_ire_cnt != 0);
3554 		DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif,
3555 		    (char *), "ire", (void *), ire);
3556 		ipif->ipif_ire_cnt--;
3557 		if (IPIF_DOWN_OK(ipif))
3558 			need_wakeup = B_TRUE;
3559 		if (stq_ill != NULL) {
3560 			ASSERT(stq_ill->ill_ire_cnt != 0);
3561 			DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill,
3562 			    (char *), "ire", (void *), ire);
3563 			stq_ill->ill_ire_cnt--;
3564 			if (ILL_DOWN_OK(stq_ill))
3565 				need_wakeup = B_TRUE;
3566 		}
3567 		if (need_wakeup) {
3568 			/* Drops the ill lock */
3569 			ipif_ill_refrele_tail(ill);
3570 		} else {
3571 			mutex_exit(&ill->ill_lock);
3572 		}
3573 	} else {
3574 		/*
3575 		 * We can't grab all the ill locks at the same time.
3576 		 * It can lead to recursive lock enter in the call to
3577 		 * ipif_ill_refrele_tail and later. Instead do it 1 at
3578 		 * a time.
3579 		 */
3580 		mutex_enter(&ill->ill_lock);
3581 		ASSERT(ipif->ipif_ire_cnt != 0);
3582 		DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif,
3583 		    (char *), "ire", (void *), ire);
3584 		ipif->ipif_ire_cnt--;
3585 		if (IPIF_DOWN_OK(ipif)) {
3586 			/* Drops the lock */
3587 			ipif_ill_refrele_tail(ill);
3588 		} else {
3589 			mutex_exit(&ill->ill_lock);
3590 		}
3591 		if (stq_ill != NULL) {
3592 			mutex_enter(&stq_ill->ill_lock);
3593 			ASSERT(stq_ill->ill_ire_cnt != 0);
3594 			DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill,
3595 			    (char *), "ire", (void *), ire);
3596 			stq_ill->ill_ire_cnt--;
3597 			if (ILL_DOWN_OK(stq_ill)) {
3598 				/* Drops the ill lock */
3599 				ipif_ill_refrele_tail(stq_ill);
3600 			} else {
3601 				mutex_exit(&stq_ill->ill_lock);
3602 			}
3603 		}
3604 	}
3605 end:
3606 	/* This should be true for both V4 and V6 */
3607 
3608 	if ((ire->ire_type & IRE_FORWARDTABLE) &&
3609 	    (ire->ire_ipversion == IPV4_VERSION) &&
3610 	    ((irb = ire->ire_bucket) != NULL)) {
3611 		rw_enter(&irb->irb_lock, RW_WRITER);
3612 		irb->irb_nire--;
3613 		/*
3614 		 * Instead of examining the conditions for freeing
3615 		 * the radix node here, we do it by calling
3616 		 * IRB_REFRELE which is a single point in the code
3617 		 * that embeds that logic. Bump up the refcnt to
3618 		 * be able to call IRB_REFRELE
3619 		 */
3620 		IRB_REFHOLD_LOCKED(irb);
3621 		rw_exit(&irb->irb_lock);
3622 		IRB_REFRELE(irb);
3623 	}
3624 	ire->ire_ipif = NULL;
3625 
3626 #ifdef DEBUG
3627 	ire_trace_cleanup(ire);
3628 #endif
3629 	mutex_destroy(&ire->ire_lock);
3630 	if (ire->ire_ipversion == IPV6_VERSION) {
3631 		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed);
3632 	} else {
3633 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
3634 	}
3635 	ASSERT(ire->ire_mp == NULL);
3636 	/* Has been allocated out of the cache */
3637 	kmem_cache_free(ire_cache, ire);
3638 }
3639 
3640 /*
3641  * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect
3642  * entries that have a given gateway address.
3643  */
3644 void
3645 ire_delete_cache_gw(ire_t *ire, char *cp)
3646 {
3647 	ipaddr_t	gw_addr;
3648 
3649 	if (!(ire->ire_type & IRE_CACHE) &&
3650 	    !(ire->ire_flags & RTF_DYNAMIC))
3651 		return;
3652 
3653 	bcopy(cp, &gw_addr, sizeof (gw_addr));
3654 	if (ire->ire_gateway_addr == gw_addr) {
3655 		ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n",
3656 		    (int)ntohl(ire->ire_addr), ire->ire_type,
3657 		    (int)ntohl(ire->ire_gateway_addr)));
3658 		ire_delete(ire);
3659 	}
3660 }
3661 
3662 /*
3663  * Remove all IRE_CACHE entries that match the ire specified.
3664  *
3665  * The flag argument indicates if the flush request is due to addition
3666  * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE).
3667  *
3668  * This routine takes only the IREs from the forwarding table and flushes
3669  * the corresponding entries from the cache table.
3670  *
3671  * When flushing due to the deletion of an old route, it
3672  * just checks the cache handles (ire_phandle and ire_ihandle) and
3673  * deletes the ones that match.
3674  *
3675  * When flushing due to the creation of a new route, it checks
3676  * if a cache entry's address matches the one in the IRE and
3677  * that the cache entry's parent has a less specific mask than the
3678  * one in IRE. The destination of such a cache entry could be the
3679  * gateway for other cache entries, so we need to flush those as
3680  * well by looking for gateway addresses matching the IRE's address.
3681  */
3682 void
3683 ire_flush_cache_v4(ire_t *ire, int flag)
3684 {
3685 	int i;
3686 	ire_t *cire;
3687 	irb_t *irb;
3688 	ip_stack_t	*ipst = ire->ire_ipst;
3689 
3690 	if (ire->ire_type & IRE_CACHE)
3691 		return;
3692 
3693 	/*
3694 	 * If a default is just created, there is no point
3695 	 * in going through the cache, as there will not be any
3696 	 * cached ires.
3697 	 */
3698 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
3699 		return;
3700 	if (flag == IRE_FLUSH_ADD) {
3701 		/*
3702 		 * This selective flush is due to the addition of
3703 		 * new IRE.
3704 		 */
3705 		for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
3706 			irb = &ipst->ips_ip_cache_table[i];
3707 			if ((cire = irb->irb_ire) == NULL)
3708 				continue;
3709 			IRB_REFHOLD(irb);
3710 			for (cire = irb->irb_ire; cire != NULL;
3711 			    cire = cire->ire_next) {
3712 				if (cire->ire_type != IRE_CACHE)
3713 					continue;
3714 				/*
3715 				 * If 'cire' belongs to the same subnet
3716 				 * as the new ire being added, and 'cire'
3717 				 * is derived from a prefix that is less
3718 				 * specific than the new ire being added,
3719 				 * we need to flush 'cire'; for instance,
3720 				 * when a new interface comes up.
3721 				 */
3722 				if (((cire->ire_addr & ire->ire_mask) ==
3723 				    (ire->ire_addr & ire->ire_mask)) &&
3724 				    (ip_mask_to_plen(cire->ire_cmask) <=
3725 				    ire->ire_masklen)) {
3726 					ire_delete(cire);
3727 					continue;
3728 				}
3729 				/*
3730 				 * This is the case when the ire_gateway_addr
3731 				 * of 'cire' belongs to the same subnet as
3732 				 * the new ire being added.
3733 				 * Flushing such ires is sometimes required to
3734 				 * avoid misrouting: say we have a machine with
3735 				 * two interfaces (I1 and I2), a default router
3736 				 * R on the I1 subnet, and a host route to an
3737 				 * off-link destination D with a gateway G on
3738 				 * the I2 subnet.
3739 				 * Under normal operation, we will have an
3740 				 * on-link cache entry for G and an off-link
3741 				 * cache entry for D with G as ire_gateway_addr,
3742 				 * traffic to D will reach its destination
3743 				 * through gateway G.
3744 				 * If the administrator does 'ifconfig I2 down',
3745 				 * the cache entries for D and G will be
3746 				 * flushed. However, G will now be resolved as
3747 				 * an off-link destination using R (the default
3748 				 * router) as gateway. Then D will also be
3749 				 * resolved as an off-link destination using G
3750 				 * as gateway - this behavior is due to
3751 				 * compatibility reasons, see comment in
3752 				 * ire_ihandle_lookup_offlink(). Traffic to D
3753 				 * will go to the router R and probably won't
3754 				 * reach the destination.
3755 				 * The administrator then does 'ifconfig I2 up'.
3756 				 * Since G is on the I2 subnet, this routine
3757 				 * will flush its cache entry. It must also
3758 				 * flush the cache entry for D, otherwise
3759 				 * traffic will stay misrouted until the IRE
3760 				 * times out.
3761 				 */
3762 				if ((cire->ire_gateway_addr & ire->ire_mask) ==
3763 				    (ire->ire_addr & ire->ire_mask)) {
3764 					ire_delete(cire);
3765 					continue;
3766 				}
3767 			}
3768 			IRB_REFRELE(irb);
3769 		}
3770 	} else {
3771 		/*
3772 		 * delete the cache entries based on
3773 		 * handle in the IRE as this IRE is
3774 		 * being deleted/changed.
3775 		 */
3776 		for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
3777 			irb = &ipst->ips_ip_cache_table[i];
3778 			if ((cire = irb->irb_ire) == NULL)
3779 				continue;
3780 			IRB_REFHOLD(irb);
3781 			for (cire = irb->irb_ire; cire != NULL;
3782 			    cire = cire->ire_next) {
3783 				if (cire->ire_type != IRE_CACHE)
3784 					continue;
3785 				if ((cire->ire_phandle == 0 ||
3786 				    cire->ire_phandle != ire->ire_phandle) &&
3787 				    (cire->ire_ihandle == 0 ||
3788 				    cire->ire_ihandle != ire->ire_ihandle))
3789 					continue;
3790 				ire_delete(cire);
3791 			}
3792 			IRB_REFRELE(irb);
3793 		}
3794 	}
3795 }
3796 
3797 /*
3798  * Matches the arguments passed with the values in the ire.
3799  *
3800  * Note: for match types that match using "ipif" passed in, ipif
3801  * must be checked for non-NULL before calling this routine.
3802  */
3803 boolean_t
3804 ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
3805     int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
3806     const ts_label_t *tsl, int match_flags, queue_t *wq)
3807 {
3808 	ill_t *ire_ill = NULL, *dst_ill;
3809 	ill_t *ipif_ill = NULL;
3810 
3811 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
3812 	ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
3813 	ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
3814 	    (ipif != NULL && !ipif->ipif_isv6));
3815 	ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL);
3816 
3817 	/*
3818 	 * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
3819 	 * is in fact hidden, to ensure the caller gets the right one.  One
3820 	 * exception: if the caller passed MATCH_IRE_IHANDLE, then they
3821 	 * already know the identity of the given IRE_INTERFACE entry and
3822 	 * there's no point trying to hide it from them.
3823 	 */
3824 	if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
3825 		if (match_flags & MATCH_IRE_IHANDLE)
3826 			match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
3827 
3828 		if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
3829 			return (B_FALSE);
3830 	}
3831 
3832 	/*
3833 	 * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option
3834 	 * is used. In that case the routing table is bypassed and the
3835 	 * packets are sent directly to the specified nexthop. The
3836 	 * IRE_CACHE entry representing this route should be marked
3837 	 * with IRE_MARK_PRIVATE_ADDR.
3838 	 */
3839 
3840 	if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) &&
3841 	    (ire->ire_marks & IRE_MARK_PRIVATE_ADDR))
3842 		return (B_FALSE);
3843 
3844 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
3845 	    ire->ire_zoneid != ALL_ZONES) {
3846 		/*
3847 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
3848 		 * valid and does not match that of ire_zoneid, a failure to
3849 		 * match is reported at this point. Otherwise, since some IREs
3850 		 * that are available in the global zone can be used in local
3851 		 * zones, additional checks need to be performed:
3852 		 *
3853 		 *	IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK
3854 		 *	entries should never be matched in this situation.
3855 		 *
3856 		 *	IRE entries that have an interface associated with them
3857 		 *	should in general not match unless they are an IRE_LOCAL
3858 		 *	or in the case when MATCH_IRE_DEFAULT has been set in
3859 		 *	the caller.  In the case of the former, checking of the
3860 		 *	other fields supplied should take place.
3861 		 *
3862 		 *	In the case where MATCH_IRE_DEFAULT has been set,
3863 		 *	all of the ipif's associated with the IRE's ill are
3864 		 *	checked to see if there is a matching zoneid.  If any
3865 		 *	one ipif has a matching zoneid, this IRE is a
3866 		 *	potential candidate so checking of the other fields
3867 		 *	takes place.
3868 		 *
3869 		 *	In the case where the IRE_INTERFACE has a usable source
3870 		 *	address (indicated by ill_usesrc_ifindex) in the
3871 		 *	correct zone then it's permitted to return this IRE
3872 		 */
3873 		if (match_flags & MATCH_IRE_ZONEONLY)
3874 			return (B_FALSE);
3875 		if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK))
3876 			return (B_FALSE);
3877 		/*
3878 		 * Note, IRE_INTERFACE can have the stq as NULL. For
3879 		 * example, if the default multicast route is tied to
3880 		 * the loopback address.
3881 		 */
3882 		if ((ire->ire_type & IRE_INTERFACE) &&
3883 		    (ire->ire_stq != NULL)) {
3884 			dst_ill = (ill_t *)ire->ire_stq->q_ptr;
3885 			/*
3886 			 * If there is a usable source address in the
3887 			 * zone, then it's ok to return an
3888 			 * IRE_INTERFACE
3889 			 */
3890 			if (ipif_usesrc_avail(dst_ill, zoneid)) {
3891 				ip3dbg(("ire_match_args: dst_ill %p match %d\n",
3892 				    (void *)dst_ill,
3893 				    (ire->ire_addr == (addr & mask))));
3894 			} else {
3895 				ip3dbg(("ire_match_args: src_ipif NULL"
3896 				    " dst_ill %p\n", (void *)dst_ill));
3897 				return (B_FALSE);
3898 			}
3899 		}
3900 		if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
3901 		    !(ire->ire_type & IRE_INTERFACE)) {
3902 			ipif_t	*tipif;
3903 
3904 			if ((match_flags & MATCH_IRE_DEFAULT) == 0) {
3905 				return (B_FALSE);
3906 			}
3907 			mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
3908 			for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
3909 			    tipif != NULL; tipif = tipif->ipif_next) {
3910 				if (IPIF_CAN_LOOKUP(tipif) &&
3911 				    (tipif->ipif_flags & IPIF_UP) &&
3912 				    (tipif->ipif_zoneid == zoneid ||
3913 				    tipif->ipif_zoneid == ALL_ZONES))
3914 					break;
3915 			}
3916 			mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
3917 			if (tipif == NULL) {
3918 				return (B_FALSE);
3919 			}
3920 		}
3921 	}
3922 
3923 	/*
3924 	 * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
3925 	 * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
3926 	 * of getting a source address -- i.e., ire_src_addr ==
3927 	 * ire->ire_ipif->ipif_src_addr).  ire_to_ill() handles this.
3928 	 *
3929 	 * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
3930 	 * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
3931 	 * IPMP test traffic), then the ill must match exactly.
3932 	 */
3933 	if (match_flags & MATCH_IRE_ILL) {
3934 		ire_ill = ire_to_ill(ire);
3935 		ipif_ill = ipif->ipif_ill;
3936 	}
3937 
3938 	if ((ire->ire_addr == (addr & mask)) &&
3939 	    ((!(match_flags & MATCH_IRE_GW)) ||
3940 	    (ire->ire_gateway_addr == gateway)) &&
3941 	    ((!(match_flags & MATCH_IRE_TYPE)) ||
3942 	    (ire->ire_type & type)) &&
3943 	    ((!(match_flags & MATCH_IRE_SRC)) ||
3944 	    (ire->ire_src_addr == ipif->ipif_src_addr)) &&
3945 	    ((!(match_flags & MATCH_IRE_IPIF)) ||
3946 	    (ire->ire_ipif == ipif)) &&
3947 	    ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
3948 	    (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
3949 	    ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) ||
3950 	    (ire->ire_type != IRE_CACHE ||
3951 	    ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) &&
3952 	    ((!(match_flags & MATCH_IRE_WQ)) ||
3953 	    (ire->ire_stq == wq)) &&
3954 	    ((!(match_flags & MATCH_IRE_ILL)) ||
3955 	    (ire_ill == ipif_ill ||
3956 	    (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
3957 	    ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
3958 	    ((!(match_flags & MATCH_IRE_IHANDLE)) ||
3959 	    (ire->ire_ihandle == ihandle)) &&
3960 	    ((!(match_flags & MATCH_IRE_MASK)) ||
3961 	    (ire->ire_mask == mask)) &&
3962 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
3963 	    (!is_system_labeled()) ||
3964 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
3965 		/* We found the matched IRE */
3966 		return (B_TRUE);
3967 	}
3968 	return (B_FALSE);
3969 }
3970 
3971 /*
3972  * Lookup for a route in all the tables
3973  */
3974 ire_t *
3975 ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
3976     int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid,
3977     const ts_label_t *tsl, int flags, ip_stack_t *ipst)
3978 {
3979 	ire_t *ire = NULL;
3980 
3981 	/*
3982 	 * ire_match_args() will dereference ipif MATCH_IRE_SRC or
3983 	 * MATCH_IRE_ILL is set.
3984 	 */
3985 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
3986 		return (NULL);
3987 
3988 	/*
3989 	 * might be asking for a cache lookup,
3990 	 * This is not best way to lookup cache,
3991 	 * user should call ire_cache_lookup directly.
3992 	 *
3993 	 * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
3994 	 * in the forwarding table, if the applicable type flags were set.
3995 	 */
3996 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
3997 		ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid,
3998 		    tsl, flags, ipst);
3999 		if (ire != NULL)
4000 			return (ire);
4001 	}
4002 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
4003 		ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire,
4004 		    zoneid, 0, tsl, flags, ipst);
4005 	}
4006 	return (ire);
4007 }
4008 
4009 /*
4010  * Delete the IRE cache for the gateway and all IRE caches whose
4011  * ire_gateway_addr points to this gateway, and allow them to
4012  * be created on demand by ip_newroute.
4013  */
4014 void
4015 ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
4016 {
4017 	irb_t *irb;
4018 	ire_t *ire;
4019 
4020 	irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
4021 	    ipst->ips_ip_cache_table_size)];
4022 	IRB_REFHOLD(irb);
4023 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
4024 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
4025 			continue;
4026 
4027 		ASSERT(ire->ire_mask == IP_HOST_MASK);
4028 		if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE,
4029 		    NULL, zoneid, 0, NULL, MATCH_IRE_TYPE, NULL)) {
4030 			ire_delete(ire);
4031 		}
4032 	}
4033 	IRB_REFRELE(irb);
4034 
4035 	ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst);
4036 }
4037 
4038 /*
4039  * Looks up cache table for a route.
4040  * specific lookup can be indicated by
4041  * passing the MATCH_* flags and the
4042  * necessary parameters.
4043  */
4044 ire_t *
4045 ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif,
4046     zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
4047 {
4048 	ire_ctable_args_t	margs;
4049 
4050 	margs.ict_addr = &addr;
4051 	margs.ict_gateway = &gateway;
4052 	margs.ict_type = type;
4053 	margs.ict_ipif = ipif;
4054 	margs.ict_zoneid = zoneid;
4055 	margs.ict_tsl = tsl;
4056 	margs.ict_flags = flags;
4057 	margs.ict_ipst = ipst;
4058 	margs.ict_wq = NULL;
4059 
4060 	return (ip4_ctable_lookup_impl(&margs));
4061 }
4062 
4063 /*
4064  * Check whether the IRE_LOCAL and the IRE potentially used to transmit
4065  * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical
4066  * or part of the same illgrp.  (In the IPMP case, usually the two IREs
4067  * will both belong to the IPMP ill, but exceptions are possible -- e.g.
4068  * if IPMP test addresses are on their own subnet.)
4069  */
4070 boolean_t
4071 ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire)
4072 {
4073 	ill_t *recv_ill, *xmit_ill;
4074 
4075 	ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK));
4076 	ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE));
4077 
4078 	recv_ill = ire_to_ill(ire_local);
4079 	xmit_ill = ire_to_ill(xmit_ire);
4080 
4081 	ASSERT(recv_ill != NULL);
4082 	ASSERT(xmit_ill != NULL);
4083 
4084 	return (IS_ON_SAME_LAN(recv_ill, xmit_ill));
4085 }
4086 
4087 /*
4088  * Check if the IRE_LOCAL uses the same ill as another route would use.
4089  * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
4090  * then we don't allow this IRE_LOCAL to be used.
4091  */
4092 boolean_t
4093 ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
4094     const ts_label_t *tsl, ip_stack_t *ipst)
4095 {
4096 	ire_t		*alt_ire;
4097 	boolean_t	rval;
4098 	int		flags;
4099 
4100 	flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE;
4101 
4102 	if (ire_local->ire_ipversion == IPV4_VERSION) {
4103 		alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL,
4104 		    NULL, zoneid, 0, tsl, flags, ipst);
4105 	} else {
4106 		alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL,
4107 		    NULL, zoneid, 0, tsl, flags, ipst);
4108 	}
4109 
4110 	if (alt_ire == NULL)
4111 		return (B_FALSE);
4112 
4113 	if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
4114 		ire_refrele(alt_ire);
4115 		return (B_FALSE);
4116 	}
4117 	rval = ire_local_same_lan(ire_local, alt_ire);
4118 
4119 	ire_refrele(alt_ire);
4120 	return (rval);
4121 }
4122 
4123 /*
4124  * Lookup cache
4125  *
4126  * In general the zoneid has to match (where ALL_ZONES match all of them).
4127  * But for IRE_LOCAL we also need to handle the case where L2 should
4128  * conceptually loop back the packet. This is necessary since neither
4129  * Ethernet drivers nor Ethernet hardware loops back packets sent to their
4130  * own MAC address. This loopback is needed when the normal
4131  * routes (ignoring IREs with different zoneids) would send out the packet on
4132  * the same ill as the ill with which this IRE_LOCAL is associated.
4133  *
4134  * Earlier versions of this code always matched an IRE_LOCAL independently of
4135  * the zoneid. We preserve that earlier behavior when
4136  * ip_restrict_interzone_loopback is turned off.
4137  */
4138 ire_t *
4139 ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl,
4140     ip_stack_t *ipst)
4141 {
4142 	irb_t *irb_ptr;
4143 	ire_t *ire;
4144 
4145 	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
4146 	    ipst->ips_ip_cache_table_size)];
4147 	rw_enter(&irb_ptr->irb_lock, RW_READER);
4148 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
4149 		if (ire->ire_marks & (IRE_MARK_CONDEMNED |
4150 		    IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) {
4151 			continue;
4152 		}
4153 		if (ire->ire_addr == addr) {
4154 			/*
4155 			 * Finally, check if the security policy has any
4156 			 * restriction on using this route for the specified
4157 			 * message.
4158 			 */
4159 			if (tsl != NULL &&
4160 			    ire->ire_gw_secattr != NULL &&
4161 			    tsol_ire_match_gwattr(ire, tsl) != 0) {
4162 				continue;
4163 			}
4164 
4165 			if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
4166 			    ire->ire_zoneid == ALL_ZONES) {
4167 				IRE_REFHOLD(ire);
4168 				rw_exit(&irb_ptr->irb_lock);
4169 				return (ire);
4170 			}
4171 
4172 			if (ire->ire_type == IRE_LOCAL) {
4173 				if (ipst->ips_ip_restrict_interzone_loopback &&
4174 				    !ire_local_ok_across_zones(ire, zoneid,
4175 				    &addr, tsl, ipst))
4176 					continue;
4177 
4178 				IRE_REFHOLD(ire);
4179 				rw_exit(&irb_ptr->irb_lock);
4180 				return (ire);
4181 			}
4182 		}
4183 	}
4184 	rw_exit(&irb_ptr->irb_lock);
4185 	return (NULL);
4186 }
4187 
4188 ire_t *
4189 ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
4190 {
4191 	irb_t *irb_ptr;
4192 	ire_t *ire;
4193 
4194 	/*
4195 	 * Look for an ire in the cachetable whose
4196 	 * ire_addr matches the destination.
4197 	 * Since we are being called by forwarding fastpath
4198 	 * no need to check for Trusted Solaris label.
4199 	 */
4200 	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
4201 	    dst, ipst->ips_ip_cache_table_size)];
4202 	rw_enter(&irb_ptr->irb_lock, RW_READER);
4203 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
4204 		if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN |
4205 		    IRE_MARK_PRIVATE_ADDR)) {
4206 			continue;
4207 		}
4208 		if (ire->ire_addr == dst) {
4209 			IRE_REFHOLD(ire);
4210 			rw_exit(&irb_ptr->irb_lock);
4211 			return (ire);
4212 		}
4213 	}
4214 	rw_exit(&irb_ptr->irb_lock);
4215 	return (NULL);
4216 }
4217 
4218 /*
4219  * Locate the interface ire that is tied to the cache ire 'cire' via
4220  * cire->ire_ihandle.
4221  *
4222  * We are trying to create the cache ire for an offlink destn based
4223  * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
4224  * as found by ip_newroute(). We are called from ip_newroute() in
4225  * the IRE_CACHE case.
4226  */
4227 ire_t *
4228 ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
4229 {
4230 	ire_t	*ire;
4231 	int	match_flags;
4232 	ipaddr_t gw_addr;
4233 	ipif_t	*gw_ipif;
4234 	ip_stack_t	*ipst = cire->ire_ipst;
4235 
4236 	ASSERT(cire != NULL && pire != NULL);
4237 
4238 	/*
4239 	 * We don't need to specify the zoneid to ire_ftable_lookup() below
4240 	 * because the ihandle refers to an ipif which can be in only one zone.
4241 	 */
4242 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
4243 	if (pire->ire_ipif != NULL)
4244 		match_flags |= MATCH_IRE_ILL;
4245 	/*
4246 	 * We know that the mask of the interface ire equals cire->ire_cmask.
4247 	 * (When ip_newroute() created 'cire' for the gateway it set its
4248 	 * cmask from the interface ire's mask)
4249 	 */
4250 	ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0,
4251 	    IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
4252 	    NULL, match_flags, ipst);
4253 	if (ire != NULL)
4254 		return (ire);
4255 	/*
4256 	 * If we didn't find an interface ire above, we can't declare failure.
4257 	 * For backwards compatibility, we need to support prefix routes
4258 	 * pointing to next hop gateways that are not on-link.
4259 	 *
4260 	 * Assume we are trying to ping some offlink destn, and we have the
4261 	 * routing table below.
4262 	 *
4263 	 * Eg.	default	- gw1		<--- pire	(line 1)
4264 	 *	gw1	- gw2				(line 2)
4265 	 *	gw2	- hme0				(line 3)
4266 	 *
4267 	 * If we already have a cache ire for gw1 in 'cire', the
4268 	 * ire_ftable_lookup above would have failed, since there is no
4269 	 * interface ire to reach gw1. We will fallthru below.
4270 	 *
4271 	 * Here we duplicate the steps that ire_ftable_lookup() did in
4272 	 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
4273 	 * The differences are the following
4274 	 * i.   We want the interface ire only, so we call ire_ftable_lookup()
4275 	 *	instead of ire_route_lookup()
4276 	 * ii.  We look for only prefix routes in the 1st call below.
4277 	 * ii.  We want to match on the ihandle in the 2nd call below.
4278 	 */
4279 	match_flags =  MATCH_IRE_TYPE;
4280 	if (pire->ire_ipif != NULL)
4281 		match_flags |= MATCH_IRE_ILL;
4282 	ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET,
4283 	    pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
4284 	if (ire == NULL)
4285 		return (NULL);
4286 	/*
4287 	 * At this point 'ire' corresponds to the entry shown in line 2.
4288 	 * gw_addr is 'gw2' in the example above.
4289 	 */
4290 	gw_addr = ire->ire_gateway_addr;
4291 	gw_ipif = ire->ire_ipif;
4292 	ire_refrele(ire);
4293 
4294 	match_flags |= MATCH_IRE_IHANDLE;
4295 	ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE,
4296 	    gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags,
4297 	    ipst);
4298 	return (ire);
4299 }
4300 
4301 /*
4302  * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
4303  * ire associated with the specified ipif.
4304  *
4305  * This might occasionally be called when IPIF_UP is not set since
4306  * the IP_MULTICAST_IF as well as creating interface routes
4307  * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
4308  *
4309  * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
4310  * the ipif, this routine might return NULL.
4311  */
4312 ire_t *
4313 ipif_to_ire(const ipif_t *ipif)
4314 {
4315 	ire_t	*ire;
4316 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
4317 	uint_t	match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK;
4318 
4319 	/*
4320 	 * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
4321 	 * so that they aren't accidentally returned.  However, if the
4322 	 * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
4323 	 */
4324 	if (IS_UNDER_IPMP(ipif->ipif_ill))
4325 		match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
4326 
4327 	ASSERT(!ipif->ipif_isv6);
4328 	if (ipif->ipif_ire_type == IRE_LOOPBACK) {
4329 		ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK,
4330 		    ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF),
4331 		    ipst);
4332 	} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
4333 		/* In this case we need to lookup destination address. */
4334 		ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0,
4335 		    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags,
4336 		    ipst);
4337 	} else {
4338 		ire = ire_ftable_lookup(ipif->ipif_subnet,
4339 		    ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL,
4340 		    ALL_ZONES, 0, NULL, match_flags, ipst);
4341 	}
4342 	return (ire);
4343 }
4344 
4345 /*
4346  * ire_walk function.
4347  * Count the number of IRE_CACHE entries in different categories.
4348  */
4349 void
4350 ire_cache_count(ire_t *ire, char *arg)
4351 {
4352 	ire_cache_count_t *icc = (ire_cache_count_t *)arg;
4353 
4354 	if (ire->ire_type != IRE_CACHE)
4355 		return;
4356 
4357 	icc->icc_total++;
4358 
4359 	if (ire->ire_ipversion == IPV6_VERSION) {
4360 		mutex_enter(&ire->ire_lock);
4361 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
4362 			mutex_exit(&ire->ire_lock);
4363 			icc->icc_onlink++;
4364 			return;
4365 		}
4366 		mutex_exit(&ire->ire_lock);
4367 	} else {
4368 		if (ire->ire_gateway_addr == 0) {
4369 			icc->icc_onlink++;
4370 			return;
4371 		}
4372 	}
4373 
4374 	ASSERT(ire->ire_ipif != NULL);
4375 	if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu)
4376 		icc->icc_pmtu++;
4377 	else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
4378 	    ire->ire_ib_pkt_count)
4379 		icc->icc_offlink++;
4380 	else
4381 		icc->icc_unused++;
4382 }
4383 
4384 /*
4385  * ire_walk function called by ip_trash_ire_reclaim().
4386  * Free a fraction of the IRE_CACHE cache entries. The fractions are
4387  * different for different categories of IRE_CACHE entries.
4388  * A fraction of zero means to not free any in that category.
4389  * Use the hash bucket id plus lbolt as a random number. Thus if the fraction
4390  * is N then every Nth hash bucket chain will be freed.
4391  */
4392 void
4393 ire_cache_reclaim(ire_t *ire, char *arg)
4394 {
4395 	ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg;
4396 	uint_t rand;
4397 	ip_stack_t	*ipst = icr->icr_ipst;
4398 
4399 	if (ire->ire_type != IRE_CACHE)
4400 		return;
4401 
4402 	if (ire->ire_ipversion == IPV6_VERSION) {
4403 		rand = (uint_t)lbolt +
4404 		    IRE_ADDR_HASH_V6(ire->ire_addr_v6,
4405 		    ipst->ips_ip6_cache_table_size);
4406 		mutex_enter(&ire->ire_lock);
4407 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
4408 			mutex_exit(&ire->ire_lock);
4409 			if (icr->icr_onlink != 0 &&
4410 			    (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
4411 				ire_delete(ire);
4412 				return;
4413 			}
4414 			goto done;
4415 		}
4416 		mutex_exit(&ire->ire_lock);
4417 	} else {
4418 		rand = (uint_t)lbolt +
4419 		    IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size);
4420 		if (ire->ire_gateway_addr == 0) {
4421 			if (icr->icr_onlink != 0 &&
4422 			    (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
4423 				ire_delete(ire);
4424 				return;
4425 			}
4426 			goto done;
4427 		}
4428 	}
4429 	/* Not onlink IRE */
4430 	ASSERT(ire->ire_ipif != NULL);
4431 	if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) {
4432 		/* Use ptmu fraction */
4433 		if (icr->icr_pmtu != 0 &&
4434 		    (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) {
4435 			ire_delete(ire);
4436 			return;
4437 		}
4438 	} else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
4439 	    ire->ire_ib_pkt_count) {
4440 		/* Use offlink fraction */
4441 		if (icr->icr_offlink != 0 &&
4442 		    (rand/icr->icr_offlink)*icr->icr_offlink == rand) {
4443 			ire_delete(ire);
4444 			return;
4445 		}
4446 	} else {
4447 		/* Use unused fraction */
4448 		if (icr->icr_unused != 0 &&
4449 		    (rand/icr->icr_unused)*icr->icr_unused == rand) {
4450 			ire_delete(ire);
4451 			return;
4452 		}
4453 	}
4454 done:
4455 	/*
4456 	 * Update tire_mark so that those that haven't been used since this
4457 	 * reclaim will be considered unused next time we reclaim.
4458 	 */
4459 	ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
4460 }
4461 
/*
 * Round *value up, in place, to a power of two.
 *
 * The result is the smallest 2^i with 1 <= i <= 31 that is >= *value:
 * the minimum result is therefore 2 (also for inputs 0 and 1), and any
 * input above 2^31 is clamped to 2^31.
 *
 * The shifts are done on an unsigned 32-bit one, because shifting a
 * signed int one left by 31 positions overflows.
 */
static void
power2_roundup(uint32_t *value)
{
	int i;

	for (i = 1; i < 31; i++) {
		if (*value <= ((uint32_t)1 << i))
			break;
	}
	/* i is 31 here when no smaller power of two was large enough. */
	*value = ((uint32_t)1 << i);
}
4473 
4474 /* Global init for all zones */
4475 void
4476 ip_ire_g_init()
4477 {
4478 	/*
4479 	 * Create ire caches, ire_reclaim()
4480 	 * will give IRE_CACHE back to system when needed.
4481 	 * This needs to be done here before anything else, since
4482 	 * ire_add() expects the cache to be created.
4483 	 */
4484 	ire_cache = kmem_cache_create("ire_cache",
4485 	    sizeof (ire_t), 0, ip_ire_constructor,
4486 	    ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0);
4487 
4488 	rt_entry_cache = kmem_cache_create("rt_entry",
4489 	    sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0);
4490 
4491 	/*
4492 	 * Have radix code setup kmem caches etc.
4493 	 */
4494 	rn_init();
4495 }
4496 
4497 void
4498 ip_ire_init(ip_stack_t *ipst)
4499 {
4500 	int i;
4501 	uint32_t mem_cnt;
4502 	uint32_t cpu_cnt;
4503 	uint32_t min_cnt;
4504 	pgcnt_t mem_avail;
4505 
4506 	/*
4507 	 * ip_ire_max_bucket_cnt is sized below based on the memory
4508 	 * size and the cpu speed of the machine. This is upper
4509 	 * bounded by the compile time value of ip_ire_max_bucket_cnt
4510 	 * and is lower bounded by the compile time value of
4511 	 * ip_ire_min_bucket_cnt.  Similar logic applies to
4512 	 * ip6_ire_max_bucket_cnt.
4513 	 *
4514 	 * We calculate this for each IP Instances in order to use
4515 	 * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are
4516 	 * in effect when the zone is booted.
4517 	 */
4518 	mem_avail = kmem_avail();
4519 	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
4520 	    ip_cache_table_size / sizeof (ire_t);
4521 	cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio;
4522 
4523 	min_cnt = MIN(cpu_cnt, mem_cnt);
4524 	if (min_cnt < ip_ire_min_bucket_cnt)
4525 		min_cnt = ip_ire_min_bucket_cnt;
4526 	if (ip_ire_max_bucket_cnt > min_cnt) {
4527 		ip_ire_max_bucket_cnt = min_cnt;
4528 	}
4529 
4530 	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
4531 	    ip6_cache_table_size / sizeof (ire_t);
4532 	min_cnt = MIN(cpu_cnt, mem_cnt);
4533 	if (min_cnt < ip6_ire_min_bucket_cnt)
4534 		min_cnt = ip6_ire_min_bucket_cnt;
4535 	if (ip6_ire_max_bucket_cnt > min_cnt) {
4536 		ip6_ire_max_bucket_cnt = min_cnt;
4537 	}
4538 
4539 	mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0);
4540 	mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL);
4541 
4542 	(void) rn_inithead((void **)&ipst->ips_ip_ftable, 32);
4543 
4544 	/* Calculate the IPv4 cache table size. */
4545 	ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size,
4546 	    ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
4547 	    ip_ire_max_bucket_cnt));
4548 	if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size)
4549 		ipst->ips_ip_cache_table_size = ip_max_cache_table_size;
4550 	/*
4551 	 * Make sure that the table size is always a power of 2.  The
4552 	 * hash macro IRE_ADDR_HASH() depends on that.
4553 	 */
4554 	power2_roundup(&ipst->ips_ip_cache_table_size);
4555 
4556 	ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size *
4557 	    sizeof (irb_t), KM_SLEEP);
4558 
4559 	for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
4560 		rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL,
4561 		    RW_DEFAULT, NULL);
4562 	}
4563 
4564 	/* Calculate the IPv6 cache table size. */
4565 	ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size,
4566 	    ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
4567 	    ip6_ire_max_bucket_cnt));
4568 	if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size)
4569 		ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size;
4570 	/*
4571 	 * Make sure that the table size is always a power of 2.  The
4572 	 * hash macro IRE_ADDR_HASH_V6() depends on that.
4573 	 */
4574 	power2_roundup(&ipst->ips_ip6_cache_table_size);
4575 
4576 	ipst->ips_ip_cache_table_v6 = kmem_zalloc(
4577 	    ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP);
4578 
4579 	for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
4580 		rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL,
4581 		    RW_DEFAULT, NULL);
4582 	}
4583 
4584 	/*
4585 	 * Make sure that the forwarding table size is a power of 2.
4586 	 * The IRE*_ADDR_HASH() macroes depend on that.
4587 	 */
4588 	ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size;
4589 	power2_roundup(&ipst->ips_ip6_ftable_hash_size);
4590 
4591 	ipst->ips_ire_handle = 1;
4592 }
4593 
/*
 * Global teardown for the IRE code: destroys the ire_cache and
 * rt_entry_cache kmem caches, then calls rn_fini().
 * NOTE(review): rn_fini() is presumably the radix-tree module teardown
 * that pairs with the rn_inithead()/rn_freehead() calls used for the
 * IPv4 forwarding table - confirm against the radix code.
 */
void
ip_ire_g_fini(void)
{
	kmem_cache_destroy(ire_cache);
	kmem_cache_destroy(rt_entry_cache);

	rn_fini();
}
4602 
4603 void
4604 ip_ire_fini(ip_stack_t *ipst)
4605 {
4606 	int i;
4607 
4608 	/*
4609 	 * Delete all IREs - assumes that the ill/ipifs have
4610 	 * been removed so what remains are just the ftable and IRE_CACHE.
4611 	 */
4612 	ire_walk(ire_delete, NULL, ipst);
4613 
4614 	rn_freehead(ipst->ips_ip_ftable);
4615 	ipst->ips_ip_ftable = NULL;
4616 
4617 	mutex_destroy(&ipst->ips_ire_ft_init_lock);
4618 	mutex_destroy(&ipst->ips_ire_handle_lock);
4619 
4620 	for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
4621 		ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL);
4622 		rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock);
4623 	}
4624 	kmem_free(ipst->ips_ip_cache_table,
4625 	    ipst->ips_ip_cache_table_size * sizeof (irb_t));
4626 	ipst->ips_ip_cache_table = NULL;
4627 
4628 	for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
4629 		ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL);
4630 		rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock);
4631 	}
4632 	kmem_free(ipst->ips_ip_cache_table_v6,
4633 	    ipst->ips_ip6_cache_table_size * sizeof (irb_t));
4634 	ipst->ips_ip_cache_table_v6 = NULL;
4635 
4636 	for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) {
4637 		irb_t *ptr;
4638 		int j;
4639 
4640 		if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL)
4641 			continue;
4642 
4643 		for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
4644 			ASSERT(ptr[j].irb_ire == NULL);
4645 			rw_destroy(&ptr[j].irb_lock);
4646 		}
4647 		mi_free(ptr);
4648 		ipst->ips_ip_forwarding_table_v6[i] = NULL;
4649 	}
4650 }
4651 
4652 /*
4653  * Check if another multirt route resolution is needed.
4654  * B_TRUE is returned is there remain a resolvable route,
4655  * or if no route for that dst is resolved yet.
4656  * B_FALSE is returned if all routes for that dst are resolved
4657  * or if the remaining unresolved routes are actually not
4658  * resolvable.
4659  * This only works in the global zone.
4660  */
4661 boolean_t
4662 ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst)
4663 {
4664 	ire_t	*first_fire;
4665 	ire_t	*first_cire;
4666 	ire_t	*fire;
4667 	ire_t	*cire;
4668 	irb_t	*firb;
4669 	irb_t	*cirb;
4670 	int	unres_cnt = 0;
4671 	boolean_t resolvable = B_FALSE;
4672 
4673 	/* Retrieve the first IRE_HOST that matches the destination */
4674 	first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL,
4675 	    NULL, ALL_ZONES, 0, tsl,
4676 	    MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst);
4677 
4678 	/* No route at all */
4679 	if (first_fire == NULL) {
4680 		return (B_TRUE);
4681 	}
4682 
4683 	firb = first_fire->ire_bucket;
4684 	ASSERT(firb != NULL);
4685 
4686 	/* Retrieve the first IRE_CACHE ire for that destination. */
4687 	first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
4688 
4689 	/* No resolved route. */
4690 	if (first_cire == NULL) {
4691 		ire_refrele(first_fire);
4692 		return (B_TRUE);
4693 	}
4694 
4695 	/*
4696 	 * At least one route is resolved. Here we look through the forward
4697 	 * and cache tables, to compare the number of declared routes
4698 	 * with the number of resolved routes. The search for a resolvable
4699 	 * route is performed only if at least one route remains
4700 	 * unresolved.
4701 	 */
4702 	cirb = first_cire->ire_bucket;
4703 	ASSERT(cirb != NULL);
4704 
4705 	/* Count the number of routes to that dest that are declared. */
4706 	IRB_REFHOLD(firb);
4707 	for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
4708 		if (!(fire->ire_flags & RTF_MULTIRT))
4709 			continue;
4710 		if (fire->ire_addr != dst)
4711 			continue;
4712 		unres_cnt++;
4713 	}
4714 	IRB_REFRELE(firb);
4715 
4716 	/* Then subtract the number of routes to that dst that are resolved */
4717 	IRB_REFHOLD(cirb);
4718 	for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
4719 		if (!(cire->ire_flags & RTF_MULTIRT))
4720 			continue;
4721 		if (cire->ire_addr != dst)
4722 			continue;
4723 		if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
4724 			continue;
4725 		unres_cnt--;
4726 	}
4727 	IRB_REFRELE(cirb);
4728 
4729 	/* At least one route is unresolved; search for a resolvable route. */
4730 	if (unres_cnt > 0)
4731 		resolvable = ire_multirt_lookup(&first_cire, &first_fire,
4732 		    MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl, ipst);
4733 
4734 	if (first_fire != NULL)
4735 		ire_refrele(first_fire);
4736 
4737 	if (first_cire != NULL)
4738 		ire_refrele(first_cire);
4739 
4740 	return (resolvable);
4741 }
4742 
4743 /*
4744  * Explore a forward_table bucket, starting from fire_arg.
4745  * fire_arg MUST be an IRE_HOST entry.
4746  *
4747  * Return B_TRUE and update *ire_arg and *fire_arg
4748  * if at least one resolvable route is found. *ire_arg
4749  * is the IRE entry for *fire_arg's gateway.
4750  *
4751  * Return B_FALSE otherwise (all routes are resolved or
4752  * the remaining unresolved routes are all unresolvable).
4753  *
4754  * The IRE selection relies on a priority mechanism
4755  * driven by the flags passed in by the caller.
4756  * The caller, such as ip_newroute_ipif(), can get the most
4757  * relevant ire at each stage of a multiple route resolution.
4758  *
4759  * The rules are:
4760  *
4761  * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE
4762  *   ires are preferred for the gateway. This gives the highest
4763  *   priority to routes that can be resolved without using
4764  *   a resolver.
4765  *
4766  * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW
4767  *   is specified but no IRE_CACHETABLE ire entry for the gateway
4768  *   is found, the following rules apply.
4769  *
4770  * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE
4771  *   ires for the gateway, that have not been tried since
4772  *   a configurable amount of time, are preferred.
4773  *   This applies when a resolver must be invoked for
4774  *   a missing route, but we don't want to use the resolver
4775  *   upon each packet emission. If no such resolver is found,
4776  *   B_FALSE is returned.
4777  *   The MULTIRT_USESTAMP flag can be combined with
4778  *   MULTIRT_CACHEGW.
4779  *
4780  * - if MULTIRT_USESTAMP is not specified in flags, the first
4781  *   unresolved but resolvable route is selected.
4782  *
4783  * - Otherwise, there is no resolvalble route, and
4784  *   B_FALSE is returned.
4785  *
4786  * At last, MULTIRT_SETSTAMP can be specified in flags to
4787  * request the timestamp of unresolvable routes to
4788  * be refreshed. This prevents the useless exploration
4789  * of those routes for a while, when MULTIRT_USESTAMP is used.
4790  *
4791  * This only works in the global zone.
4792  */
4793 boolean_t
4794 ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
4795     const ts_label_t *tsl, ip_stack_t *ipst)
4796 {
4797 	clock_t	delta;
4798 	ire_t	*best_fire = NULL;
4799 	ire_t	*best_cire = NULL;
4800 	ire_t	*first_fire;
4801 	ire_t	*first_cire;
4802 	ire_t	*fire;
4803 	ire_t	*cire;
4804 	irb_t	*firb = NULL;
4805 	irb_t	*cirb = NULL;
4806 	ire_t	*gw_ire;
4807 	boolean_t	already_resolved;
4808 	boolean_t	res;
4809 	ipaddr_t	dst;
4810 	ipaddr_t	gw;
4811 
4812 	ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n",
4813 	    (void *)*ire_arg, (void *)*fire_arg, flags));
4814 
4815 	ASSERT(ire_arg != NULL);
4816 	ASSERT(fire_arg != NULL);
4817 
4818 	/* Not an IRE_HOST ire; give up. */
4819 	if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) {
4820 		return (B_FALSE);
4821 	}
4822 
4823 	/* This is the first IRE_HOST ire for that destination. */
4824 	first_fire = *fire_arg;
4825 	firb = first_fire->ire_bucket;
4826 	ASSERT(firb != NULL);
4827 
4828 	dst = first_fire->ire_addr;
4829 
4830 	ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst)));
4831 
4832 	/*
4833 	 * Retrieve the first IRE_CACHE ire for that destination;
4834 	 * if we don't find one, no route for that dest is
4835 	 * resolved yet.
4836 	 */
4837 	first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
4838 	if (first_cire != NULL) {
4839 		cirb = first_cire->ire_bucket;
4840 	}
4841 
4842 	ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire));
4843 
4844 	/*
4845 	 * Search for a resolvable route, giving the top priority
4846 	 * to routes that can be resolved without any call to the resolver.
4847 	 */
4848 	IRB_REFHOLD(firb);
4849 
4850 	if (!CLASSD(dst)) {
4851 		/*
4852 		 * For all multiroute IRE_HOST ires for that destination,
4853 		 * check if the route via the IRE_HOST's gateway is
4854 		 * resolved yet.
4855 		 */
4856 		for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
4857 
4858 			if (!(fire->ire_flags & RTF_MULTIRT))
4859 				continue;
4860 			if (fire->ire_addr != dst)
4861 				continue;
4862 
4863 			if (fire->ire_gw_secattr != NULL &&
4864 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
4865 				continue;
4866 			}
4867 
4868 			gw = fire->ire_gateway_addr;
4869 
4870 			ip2dbg(("ire_multirt_lookup: fire %p, "
4871 			    "ire_addr %08x, ire_gateway_addr %08x\n",
4872 			    (void *)fire, ntohl(fire->ire_addr), ntohl(gw)));
4873 
4874 			already_resolved = B_FALSE;
4875 
4876 			if (first_cire != NULL) {
4877 				ASSERT(cirb != NULL);
4878 
4879 				IRB_REFHOLD(cirb);
4880 				/*
4881 				 * For all IRE_CACHE ires for that
4882 				 * destination.
4883 				 */
4884 				for (cire = first_cire;
4885 				    cire != NULL;
4886 				    cire = cire->ire_next) {
4887 
4888 					if (!(cire->ire_flags & RTF_MULTIRT))
4889 						continue;
4890 					if (cire->ire_addr != dst)
4891 						continue;
4892 					if (cire->ire_marks &
4893 					    (IRE_MARK_CONDEMNED |
4894 					    IRE_MARK_TESTHIDDEN))
4895 						continue;
4896 
4897 					if (cire->ire_gw_secattr != NULL &&
4898 					    tsol_ire_match_gwattr(cire,
4899 					    tsl) != 0) {
4900 						continue;
4901 					}
4902 
4903 					/*
4904 					 * Check if the IRE_CACHE's gateway
4905 					 * matches the IRE_HOST's gateway.
4906 					 */
4907 					if (cire->ire_gateway_addr == gw) {
4908 						already_resolved = B_TRUE;
4909 						break;
4910 					}
4911 				}
4912 				IRB_REFRELE(cirb);
4913 			}
4914 
4915 			/*
4916 			 * This route is already resolved;
4917 			 * proceed with next one.
4918 			 */
4919 			if (already_resolved) {
4920 				ip2dbg(("ire_multirt_lookup: found cire %p, "
4921 				    "already resolved\n", (void *)cire));
4922 				continue;
4923 			}
4924 
4925 			/*
4926 			 * The route is unresolved; is it actually
4927 			 * resolvable, i.e. is there a cache or a resolver
4928 			 * for the gateway?
4929 			 */
4930 			gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL,
4931 			    ALL_ZONES, tsl,
4932 			    MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst);
4933 
4934 			ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n",
4935 			    (void *)gw_ire));
4936 
4937 			/*
4938 			 * If gw_ire is typed IRE_CACHETABLE,
4939 			 * this route can be resolved without any call to the
4940 			 * resolver. If the MULTIRT_CACHEGW flag is set,
4941 			 * give the top priority to this ire and exit the
4942 			 * loop.
4943 			 * This is typically the case when an ARP reply
4944 			 * is processed through ip_wput_nondata().
4945 			 */
4946 			if ((flags & MULTIRT_CACHEGW) &&
4947 			    (gw_ire != NULL) &&
4948 			    (gw_ire->ire_type & IRE_CACHETABLE)) {
4949 				ASSERT(gw_ire->ire_nce == NULL ||
4950 				    gw_ire->ire_nce->nce_state == ND_REACHABLE);
4951 				/*
4952 				 * Release the resolver associated to the
4953 				 * previous candidate best ire, if any.
4954 				 */
4955 				if (best_cire != NULL) {
4956 					ire_refrele(best_cire);
4957 					ASSERT(best_fire != NULL);
4958 				}
4959 
4960 				best_fire = fire;
4961 				best_cire = gw_ire;
4962 
4963 				ip2dbg(("ire_multirt_lookup: found top prio "
4964 				    "best_fire %p, best_cire %p\n",
4965 				    (void *)best_fire, (void *)best_cire));
4966 				break;
4967 			}
4968 
4969 			/*
4970 			 * Compute the time elapsed since our preceding
4971 			 * attempt to  resolve that route.
4972 			 * If the MULTIRT_USESTAMP flag is set, we take that
4973 			 * route into account only if this time interval
4974 			 * exceeds ip_multirt_resolution_interval;
4975 			 * this prevents us from attempting to resolve a
4976 			 * broken route upon each sending of a packet.
4977 			 */
4978 			delta = lbolt - fire->ire_last_used_time;
4979 			delta = TICK_TO_MSEC(delta);
4980 
4981 			res = (boolean_t)((delta >
4982 			    ipst->ips_ip_multirt_resolution_interval) ||
4983 			    (!(flags & MULTIRT_USESTAMP)));
4984 
4985 			ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, "
4986 			    "res %d\n",
4987 			    (void *)fire, delta, res));
4988 
4989 			if (res) {
4990 				/*
4991 				 * We are here if MULTIRT_USESTAMP flag is set
4992 				 * and the resolver for fire's gateway
4993 				 * has not been tried since
4994 				 * ip_multirt_resolution_interval, or if
4995 				 * MULTIRT_USESTAMP is not set but gw_ire did
4996 				 * not fill the conditions for MULTIRT_CACHEGW,
4997 				 * or if neither MULTIRT_USESTAMP nor
4998 				 * MULTIRT_CACHEGW are set.
4999 				 */
5000 				if (gw_ire != NULL) {
5001 					if (best_fire == NULL) {
5002 						ASSERT(best_cire == NULL);
5003 
5004 						best_fire = fire;
5005 						best_cire = gw_ire;
5006 
5007 						ip2dbg(("ire_multirt_lookup:"
5008 						    "found candidate "
5009 						    "best_fire %p, "
5010 						    "best_cire %p\n",
5011 						    (void *)best_fire,
5012 						    (void *)best_cire));
5013 
5014 						/*
5015 						 * If MULTIRT_CACHEGW is not
5016 						 * set, we ignore the top
5017 						 * priority ires that can
5018 						 * be resolved without any
5019 						 * call to the resolver;
5020 						 * In that case, there is
5021 						 * actually no need
5022 						 * to continue the loop.
5023 						 */
5024 						if (!(flags &
5025 						    MULTIRT_CACHEGW)) {
5026 							break;
5027 						}
5028 						continue;
5029 					}
5030 				} else {
5031 					/*
5032 					 * No resolver for the gateway: the
5033 					 * route is not resolvable.
5034 					 * If the MULTIRT_SETSTAMP flag is
5035 					 * set, we stamp the IRE_HOST ire,
5036 					 * so we will not select it again
5037 					 * during this resolution interval.
5038 					 */
5039 					if (flags & MULTIRT_SETSTAMP)
5040 						fire->ire_last_used_time =
5041 						    lbolt;
5042 				}
5043 			}
5044 
5045 			if (gw_ire != NULL)
5046 				ire_refrele(gw_ire);
5047 		}
5048 	} else { /* CLASSD(dst) */
5049 
5050 		for (fire = first_fire;
5051 		    fire != NULL;
5052 		    fire = fire->ire_next) {
5053 
5054 			if (!(fire->ire_flags & RTF_MULTIRT))
5055 				continue;
5056 			if (fire->ire_addr != dst)
5057 				continue;
5058 
5059 			if (fire->ire_gw_secattr != NULL &&
5060 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
5061 				continue;
5062 			}
5063 
5064 			already_resolved = B_FALSE;
5065 
5066 			gw = fire->ire_gateway_addr;
5067 
5068 			gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE,
5069 			    NULL, NULL, ALL_ZONES, 0, tsl,
5070 			    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
5071 			    MATCH_IRE_SECATTR, ipst);
5072 
5073 			/* No resolver for the gateway; we skip this ire. */
5074 			if (gw_ire == NULL) {
5075 				continue;
5076 			}
5077 			ASSERT(gw_ire->ire_nce == NULL ||
5078 			    gw_ire->ire_nce->nce_state == ND_REACHABLE);
5079 
5080 			if (first_cire != NULL) {
5081 
5082 				IRB_REFHOLD(cirb);
5083 				/*
5084 				 * For all IRE_CACHE ires for that
5085 				 * destination.
5086 				 */
5087 				for (cire = first_cire;
5088 				    cire != NULL;
5089 				    cire = cire->ire_next) {
5090 
5091 					if (!(cire->ire_flags & RTF_MULTIRT))
5092 						continue;
5093 					if (cire->ire_addr != dst)
5094 						continue;
5095 					if (cire->ire_marks &
5096 					    (IRE_MARK_CONDEMNED |
5097 					    IRE_MARK_TESTHIDDEN))
5098 						continue;
5099 
5100 					if (cire->ire_gw_secattr != NULL &&
5101 					    tsol_ire_match_gwattr(cire,
5102 					    tsl) != 0) {
5103 						continue;
5104 					}
5105 
5106 					/*
5107 					 * Cache entries are linked to the
5108 					 * parent routes using the parent handle
5109 					 * (ire_phandle). If no cache entry has
5110 					 * the same handle as fire, fire is
5111 					 * still unresolved.
5112 					 */
5113 					ASSERT(cire->ire_phandle != 0);
5114 					if (cire->ire_phandle ==
5115 					    fire->ire_phandle) {
5116 						already_resolved = B_TRUE;
5117 						break;
5118 					}
5119 				}
5120 				IRB_REFRELE(cirb);
5121 			}
5122 
5123 			/*
5124 			 * This route is already resolved; proceed with
5125 			 * next one.
5126 			 */
5127 			if (already_resolved) {
5128 				ire_refrele(gw_ire);
5129 				continue;
5130 			}
5131 
5132 			/*
5133 			 * Compute the time elapsed since our preceding
5134 			 * attempt to resolve that route.
5135 			 * If the MULTIRT_USESTAMP flag is set, we take
5136 			 * that route into account only if this time
5137 			 * interval exceeds ip_multirt_resolution_interval;
5138 			 * this prevents us from attempting to resolve a
5139 			 * broken route upon each sending of a packet.
5140 			 */
5141 			delta = lbolt - fire->ire_last_used_time;
5142 			delta = TICK_TO_MSEC(delta);
5143 
5144 			res = (boolean_t)((delta >
5145 			    ipst->ips_ip_multirt_resolution_interval) ||
5146 			    (!(flags & MULTIRT_USESTAMP)));
5147 
5148 			ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, "
5149 			    "flags %04x, res %d\n",
5150 			    (void *)fire, delta, flags, res));
5151 
5152 			if (res) {
5153 				if (best_cire != NULL) {
5154 					/*
5155 					 * Release the resolver associated
5156 					 * to the preceding candidate best
5157 					 * ire, if any.
5158 					 */
5159 					ire_refrele(best_cire);
5160 					ASSERT(best_fire != NULL);
5161 				}
5162 				best_fire = fire;
5163 				best_cire = gw_ire;
5164 				continue;
5165 			}
5166 
5167 			ire_refrele(gw_ire);
5168 		}
5169 	}
5170 
5171 	if (best_fire != NULL) {
5172 		IRE_REFHOLD(best_fire);
5173 	}
5174 	IRB_REFRELE(firb);
5175 
5176 	/* Release the first IRE_CACHE we initially looked up, if any. */
5177 	if (first_cire != NULL)
5178 		ire_refrele(first_cire);
5179 
5180 	/* Found a resolvable route. */
5181 	if (best_fire != NULL) {
5182 		ASSERT(best_cire != NULL);
5183 
5184 		if (*fire_arg != NULL)
5185 			ire_refrele(*fire_arg);
5186 		if (*ire_arg != NULL)
5187 			ire_refrele(*ire_arg);
5188 
5189 		/*
5190 		 * Update the passed-in arguments with the
5191 		 * resolvable multirt route we found.
5192 		 */
5193 		*fire_arg = best_fire;
5194 		*ire_arg = best_cire;
5195 
5196 		ip2dbg(("ire_multirt_lookup: returning B_TRUE, "
5197 		    "*fire_arg %p, *ire_arg %p\n",
5198 		    (void *)best_fire, (void *)best_cire));
5199 
5200 		return (B_TRUE);
5201 	}
5202 
5203 	ASSERT(best_cire == NULL);
5204 
5205 	ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, "
5206 	    "*ire_arg %p\n",
5207 	    (void *)*fire_arg, (void *)*ire_arg));
5208 
5209 	/* No resolvable route. */
5210 	return (B_FALSE);
5211 }
5212 
5213 /*
5214  * IRE iterator for inbound and loopback broadcast processing.
5215  * Given an IRE_BROADCAST ire, walk the ires with the same destination
5216  * address, but skip over the passed-in ire. Returns the next ire without
5217  * a hold - assumes that the caller holds a reference on the IRE bucket.
5218  */
5219 ire_t *
5220 ire_get_next_bcast_ire(ire_t *curr, ire_t *ire)
5221 {
5222 	ill_t *ill;
5223 
5224 	if (curr == NULL) {
5225 		for (curr = ire->ire_bucket->irb_ire; curr != NULL;
5226 		    curr = curr->ire_next) {
5227 			if (curr->ire_addr == ire->ire_addr)
5228 				break;
5229 		}
5230 	} else {
5231 		curr = curr->ire_next;
5232 	}
5233 	ill = ire_to_ill(ire);
5234 	for (; curr != NULL; curr = curr->ire_next) {
5235 		if (curr->ire_addr != ire->ire_addr) {
5236 			/*
5237 			 * All the IREs to a given destination are contiguous;
5238 			 * break out once the address doesn't match.
5239 			 */
5240 			break;
5241 		}
5242 		if (curr == ire) {
5243 			/* skip over the passed-in ire */
5244 			continue;
5245 		}
5246 		if ((curr->ire_stq != NULL && ire->ire_stq == NULL) ||
5247 		    (curr->ire_stq == NULL && ire->ire_stq != NULL)) {
5248 			/*
5249 			 * If the passed-in ire is loopback, skip over
5250 			 * non-loopback ires and vice versa.
5251 			 */
5252 			continue;
5253 		}
5254 		if (ire_to_ill(curr) != ill) {
5255 			/* skip over IREs going through a different interface */
5256 			continue;
5257 		}
5258 		if (curr->ire_marks & IRE_MARK_CONDEMNED) {
5259 			/* skip over deleted IREs */
5260 			continue;
5261 		}
5262 		return (curr);
5263 	}
5264 	return (NULL);
5265 }
5266 
5267 #ifdef DEBUG
5268 void
5269 ire_trace_ref(ire_t *ire)
5270 {
5271 	mutex_enter(&ire->ire_lock);
5272 	if (ire->ire_trace_disable) {
5273 		mutex_exit(&ire->ire_lock);
5274 		return;
5275 	}
5276 
5277 	if (th_trace_ref(ire, ire->ire_ipst)) {
5278 		mutex_exit(&ire->ire_lock);
5279 	} else {
5280 		ire->ire_trace_disable = B_TRUE;
5281 		mutex_exit(&ire->ire_lock);
5282 		ire_trace_cleanup(ire);
5283 	}
5284 }
5285 
5286 void
5287 ire_untrace_ref(ire_t *ire)
5288 {
5289 	mutex_enter(&ire->ire_lock);
5290 	if (!ire->ire_trace_disable)
5291 		th_trace_unref(ire);
5292 	mutex_exit(&ire->ire_lock);
5293 }
5294 
/*
 * Thin wrapper around th_trace_cleanup(): passes the ire together with
 * its current ire_trace_disable flag.
 */
static void
ire_trace_cleanup(const ire_t *ire)
{
	th_trace_cleanup(ire, ire->ire_trace_disable);
}
5300 #endif /* DEBUG */
5301 
5302 /*
5303  * Generate a message chain with an arp request to resolve the in_ire.
5304  * It is assumed that in_ire itself is currently in the ire cache table,
5305  * so we create a fake_ire filled with enough information about ire_addr etc.
5306  * to retrieve in_ire when the DL_UNITDATA response from the resolver
5307  * comes back. The fake_ire itself is created by calling esballoc with
5308  * the fr_rtnp (free routine) set to ire_freemblk. This routine will be
5309  * invoked when the mblk containing fake_ire is freed.
5310  */
5311 void
5312 ire_arpresolve(ire_t *in_ire)
5313 {
5314 	areq_t		*areq;
5315 	ipaddr_t	*addrp;
5316 	mblk_t 		*ire_mp, *areq_mp;
5317 	ire_t 		*ire, *buf;
5318 	size_t		bufsize;
5319 	frtn_t		*frtnp;
5320 	ill_t		*dst_ill;
5321 	ip_stack_t	*ipst;
5322 
5323 	ASSERT(in_ire->ire_nce != NULL);
5324 
5325 	dst_ill = ire_to_ill(in_ire);
5326 	ipst = dst_ill->ill_ipst;
5327 
5328 	/*
5329 	 * Construct message chain for the resolver
5330 	 * of the form:
5331 	 *	ARP_REQ_MBLK-->IRE_MBLK
5332 	 *
5333 	 * NOTE : If the response does not
5334 	 * come back, ARP frees the packet. For this reason,
5335 	 * we can't REFHOLD the bucket of save_ire to prevent
5336 	 * deletions. We may not be able to REFRELE the bucket
5337 	 * if the response never comes back. Thus, before
5338 	 * adding the ire, ire_add_v4 will make sure that the
5339 	 * interface route does not get deleted. This is the
5340 	 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6
5341 	 * where we can always prevent deletions because of
5342 	 * the synchronous nature of adding IRES i.e
5343 	 * ire_add_then_send is called after creating the IRE.
5344 	 */
5345 
5346 	/*
5347 	 * We use esballoc to allocate the second part (IRE_MBLK)
5348 	 * of the message chain depicted above.  This mblk will be freed
5349 	 * by arp when there is a timeout, and otherwise passed to IP
5350 	 * and IP will free it after processing the ARP response.
5351 	 */
5352 
5353 	bufsize = sizeof (ire_t) + sizeof (frtn_t);
5354 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
5355 	if (buf == NULL) {
5356 		ip1dbg(("ire_arpresolve: alloc buffer failed\n"));
5357 		return;
5358 	}
5359 	frtnp = (frtn_t *)(buf + 1);
5360 	frtnp->free_arg = (caddr_t)buf;
5361 	frtnp->free_func = ire_freemblk;
5362 
5363 	ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
5364 	if (ire_mp == NULL) {
5365 		ip1dbg(("ire_arpresolve: esballoc failed\n"));
5366 		kmem_free(buf, bufsize);
5367 		return;
5368 	}
5369 
5370 	areq_mp = copyb(dst_ill->ill_resolver_mp);
5371 	if (areq_mp == NULL) {
5372 		freemsg(ire_mp);
5373 		return;
5374 	}
5375 
5376 	ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE;
5377 	ire = (ire_t *)buf;
5378 	/*
5379 	 * keep enough info in the fake ire so that we can pull up
5380 	 * the incomplete ire (in_ire) after result comes back from
5381 	 * arp and make it complete.
5382 	 */
5383 	*ire = ire_null;
5384 	ire->ire_u = in_ire->ire_u;
5385 	ire->ire_ipif_seqid = in_ire->ire_ipif_seqid;
5386 	ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex;
5387 	ire->ire_ipif = in_ire->ire_ipif;
5388 	ire->ire_stq = dst_ill->ill_wq;
5389 	ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex;
5390 	ire->ire_zoneid = in_ire->ire_zoneid;
5391 	ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
5392 	ire->ire_ipst = ipst;
5393 
5394 	/*
5395 	 * ire_freemblk will be called when ire_mp is freed, both for
5396 	 * successful and failed arp resolution. IRE_MARK_UNCACHED will be set
5397 	 * when the arp resolution failed.
5398 	 */
5399 	ire->ire_marks |= IRE_MARK_UNCACHED;
5400 	ire->ire_mp = ire_mp;
5401 	ire_mp->b_wptr = (uchar_t *)&ire[1];
5402 	ire_mp->b_cont = NULL;
5403 	linkb(areq_mp, ire_mp);
5404 
5405 	/*
5406 	 * Fill in the source and dest addrs for the resolver.
5407 	 * NOTE: this depends on memory layouts imposed by
5408 	 * ill_init().
5409 	 */
5410 	areq = (areq_t *)areq_mp->b_rptr;
5411 	addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset);
5412 	*addrp = ire->ire_src_addr;
5413 
5414 	addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset);
5415 	if (ire->ire_gateway_addr != INADDR_ANY) {
5416 		*addrp = ire->ire_gateway_addr;
5417 	} else {
5418 		*addrp = ire->ire_addr;
5419 	}
5420 
5421 	/* Up to the resolver. */
5422 	if (canputnext(dst_ill->ill_rq)) {
5423 		putnext(dst_ill->ill_rq, areq_mp);
5424 	} else {
5425 		freemsg(areq_mp);
5426 	}
5427 }
5428 
5429 /*
5430  * Esballoc free function for AR_ENTRY_QUERY request to clean up any
5431  * unresolved ire_t and/or nce_t structures when ARP resolution fails.
5432  *
5433  * This function can be called by ARP via free routine for ire_mp or
5434  * by IPv4(both host and forwarding path) via ire_delete
5435  * in case ARP resolution fails.
5436  * NOTE: Since IP is MT, ARP can call into IP but not vice versa
5437  * (for IP to talk to ARP, it still has to send AR* messages).
5438  *
5439  * Note that the ARP/IP merge should replace the functioanlity by providing
5440  * direct function calls to clean up unresolved entries in ire/nce lists.
5441  */
5442 void
5443 ire_freemblk(ire_t *ire_mp)
5444 {
5445 	nce_t		*nce = NULL;
5446 	ill_t		*ill;
5447 	ip_stack_t	*ipst;
5448 	netstack_t	*ns = NULL;
5449 
5450 	ASSERT(ire_mp != NULL);
5451 
5452 	if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) {
5453 		ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n",
5454 		    (void *)ire_mp));
5455 		goto cleanup;
5456 	}
5457 	if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) {
5458 		goto cleanup; /* everything succeeded. just free and return */
5459 	}
5460 
5461 	/*
5462 	 * the arp information corresponding to this ire_mp was not
5463 	 * transferred to an ire_cache entry. Need
5464 	 * to clean up incomplete ire's and nce, if necessary.
5465 	 */
5466 	ASSERT(ire_mp->ire_stq != NULL);
5467 	ASSERT(ire_mp->ire_stq_ifindex != 0);
5468 	ASSERT(ire_mp->ire_ipst != NULL);
5469 
5470 	ns = netstack_find_by_stackid(ire_mp->ire_stackid);
5471 	ipst = (ns ? ns->netstack_ip : NULL);
5472 	if (ipst == NULL || ipst != ire_mp->ire_ipst) /* Disapeared on us */
5473 		goto  cleanup;
5474 
5475 	/*
5476 	 * Get any nce's corresponding to this ire_mp. We first have to
5477 	 * make sure that the ill is still around.
5478 	 */
5479 	ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex,
5480 	    B_FALSE, NULL, NULL, NULL, NULL, ipst);
5481 	if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) ||
5482 	    (ill->ill_state_flags & ILL_CONDEMNED)) {
5483 		/*
5484 		 * ill went away. no nce to clean up.
5485 		 * Note that the ill_state_flags could be set to
5486 		 * ILL_CONDEMNED after this point, but if we know
5487 		 * that it is CONDEMNED now, we just bail out quickly.
5488 		 */
5489 		if (ill != NULL)
5490 			ill_refrele(ill);
5491 		goto cleanup;
5492 	}
5493 	nce = ndp_lookup_v4(ill,
5494 	    ((ire_mp->ire_gateway_addr != INADDR_ANY) ?
5495 	    &ire_mp->ire_gateway_addr : &ire_mp->ire_addr),
5496 	    B_FALSE);
5497 	ill_refrele(ill);
5498 
5499 	if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) {
5500 		/*
5501 		 * some incomplete nce was found.
5502 		 */
5503 		DTRACE_PROBE2(ire__freemblk__arp__resolv__fail,
5504 		    nce_t *, nce, ire_t *, ire_mp);
5505 		/*
5506 		 * Send the icmp_unreachable messages for the queued mblks in
5507 		 * ire->ire_nce->nce_qd_mp, since ARP resolution failed
5508 		 * for this ire
5509 		 */
5510 		arp_resolv_failed(nce);
5511 		/*
5512 		 * Delete the nce and clean up all ire's pointing at this nce
5513 		 * in the cachetable
5514 		 */
5515 		ndp_delete(nce);
5516 	}
5517 	if (nce != NULL)
5518 		NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */
5519 
5520 cleanup:
5521 	if (ns != NULL)
5522 		netstack_rele(ns);
5523 	/*
5524 	 * Get rid of the ire buffer
5525 	 * We call kmem_free here(instead of ire_delete()), since
5526 	 * this is the freeb's callback.
5527 	 */
5528 	kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t));
5529 }
5530 
5531 /*
5532  * find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and
5533  * non-loopback IRE_BROADCAST ire's.
5534  *
5535  * If a neighbor-cache entry has to be created (i.e., one does not already
5536  * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache
5537  * entry are initialized in ndp_add_v4(). These values are picked from
5538  * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the
5539  * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values
5540  * determine the {nce_state, nce_res_mp} of the nce_t created. All
5541  * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp
 * is set to the ill_bcast_mp of the outgoing interface. For unicast ire
5543  * entries,
5544  *   - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
5545  *     nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state.
5546  *   - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link
5547  *     layer resolution is necessary, so that the nce_t will be in the
5548  *     ND_REACHABLE state and the nce_res_mp will have a copy of the
5549  *     ill_resolver_mp of the outgoing interface.
5550  *
5551  * The link layer information needed for broadcast addresses, and for
5552  * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that
5553  * never needs re-verification for the lifetime of the nce_t. These are
5554  * therefore marked NCE_F_PERMANENT, and never allowed to expire via
5555  * NCE_EXPIRED.
5556  *
5557  * IRE_CACHE ire's contain the information for  the nexthop (ire_gateway_addr)
5558  * in the case of indirect routes, and for the dst itself (ire_addr) in the
5559  * case of direct routes, with the nce_res_mp containing a template
5560  * DL_UNITDATA request.
5561  *
5562  * The actual association of the ire_nce to the nce created here is
5563  * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions
5564  * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which
5565  * the ire_nce assignment is done in ire_add_then_send.
5566  */
5567 int
5568 ire_nce_init(ire_t *ire, nce_t *src_nce)
5569 {
5570 	in_addr_t	addr4;
5571 	int		err;
5572 	nce_t		*nce = NULL;
5573 	ill_t		*ire_ill;
5574 	uint16_t	nce_flags = 0;
5575 	ip_stack_t	*ipst;
5576 
5577 	if (ire->ire_stq == NULL)
5578 		return (0); /* no need to create nce for local/loopback */
5579 
5580 	switch (ire->ire_type) {
5581 	case IRE_CACHE:
5582 		if (ire->ire_gateway_addr != INADDR_ANY)
5583 			addr4 = ire->ire_gateway_addr; /* 'G' route */
5584 		else
5585 			addr4 = ire->ire_addr; /* direct route */
5586 		break;
5587 	case IRE_BROADCAST:
5588 		addr4 = ire->ire_addr;
5589 		nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST);
5590 		break;
5591 	default:
5592 		return (0);
5593 	}
5594 
5595 	/*
5596 	 * ire_ipif is picked based on RTF_SETSRC, usesrc etc.
5597 	 * rules in ire_forward_src_ipif. We want the dlureq_mp
5598 	 * for the outgoing interface, which we get from the ire_stq.
5599 	 */
5600 	ire_ill = ire_to_ill(ire);
5601 	ipst = ire_ill->ill_ipst;
5602 
5603 	/*
5604 	 * IRE_IF_NORESOLVER entries never need re-verification and
5605 	 * do not expire, so we mark them as NCE_F_PERMANENT.
5606 	 */
5607 	if (ire_ill->ill_net_type == IRE_IF_NORESOLVER)
5608 		nce_flags |= NCE_F_PERMANENT;
5609 
5610 retry_nce:
5611 	err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags,
5612 	    &nce, src_nce);
5613 
5614 	if (err == EEXIST && NCE_EXPIRED(nce, ipst)) {
5615 		/*
5616 		 * We looked up an expired nce.
5617 		 * Go back and try to create one again.
5618 		 */
5619 		ndp_delete(nce);
5620 		NCE_REFRELE(nce);
5621 		nce = NULL;
5622 		goto retry_nce;
5623 	}
5624 
5625 	ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n",
5626 	    (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err));
5627 
5628 	switch (err) {
5629 	case 0:
5630 	case EEXIST:
5631 		/*
5632 		 * return a pointer to a newly created or existing nce_t;
5633 		 * note that the ire-nce mapping is many-one, i.e.,
5634 		 * multiple ire's could point to the same nce_t.
5635 		 */
5636 		break;
5637 	default:
5638 		DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err);
5639 		return (EINVAL);
5640 	}
5641 	if (ire->ire_type == IRE_BROADCAST) {
5642 		/*
5643 		 * Two bcast ires are created for each interface;
5644 		 * 1. loopback copy (which does not  have an
5645 		 *    ire_stq, and therefore has no ire_nce), and,
5646 		 * 2. the non-loopback copy, which has the nce_res_mp
5647 		 *    initialized to a copy of the ill_bcast_mp, and
5648 		 *    is marked as ND_REACHABLE at this point.
5649 		 *    This nce does not undergo any further state changes,
5650 		 *    and exists as long as the interface is plumbed.
5651 		 * Note: the assignment of ire_nce here is a historical
5652 		 * artifact of old code that used to inline ire_add().
5653 		 */
5654 		ire->ire_nce = nce;
5655 		/*
5656 		 * We are associating this nce to the ire,
5657 		 * so change the nce ref taken in
5658 		 * ndp_lookup_then_add_v4() from
5659 		 * NCE_REFHOLD to NCE_REFHOLD_NOTR
5660 		 */
5661 		NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
5662 	} else {
5663 		/*
5664 		 * We are not using this nce_t just yet so release
5665 		 * the ref taken in ndp_lookup_then_add_v4()
5666 		 */
5667 		NCE_REFRELE(nce);
5668 	}
5669 	return (0);
5670 }
5671 
5672 /*
5673  * This is the implementation of the IPv4 IRE cache lookup procedure.
5674  * Separating the interface from the implementation allows additional
5675  * flexibility when specifying search criteria.
5676  */
5677 static ire_t *
5678 ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
5679 {
5680 	irb_t			*irb_ptr;
5681 	ire_t			*ire;
5682 	ip_stack_t		*ipst = margs->ict_ipst;
5683 
5684 	if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
5685 	    (margs->ict_ipif == NULL)) {
5686 		return (NULL);
5687 	}
5688 
5689 	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
5690 	    *((ipaddr_t *)margs->ict_addr), ipst->ips_ip_cache_table_size)];
5691 	rw_enter(&irb_ptr->irb_lock, RW_READER);
5692 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
5693 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
5694 			continue;
5695 		ASSERT(ire->ire_mask == IP_HOST_MASK);
5696 		if (ire_match_args(ire, *((ipaddr_t *)margs->ict_addr),
5697 		    ire->ire_mask, *((ipaddr_t *)margs->ict_gateway),
5698 		    margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0,
5699 		    margs->ict_tsl, margs->ict_flags, margs->ict_wq)) {
5700 			IRE_REFHOLD(ire);
5701 			rw_exit(&irb_ptr->irb_lock);
5702 			return (ire);
5703 		}
5704 	}
5705 
5706 	rw_exit(&irb_ptr->irb_lock);
5707 	return (NULL);
5708 }
5709 
5710 /*
5711  * This function locates IRE_CACHE entries which were added by the
5712  * ire_forward() path. We can fully specify the IRE we are looking for by
5713  * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ).
5714  */
5715 ire_t *
5716 ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif,
5717     zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq)
5718 {
5719 	ire_ctable_args_t	margs;
5720 
5721 	margs.ict_addr = &addr;
5722 	margs.ict_gateway = &gw;
5723 	margs.ict_type = IRE_CACHE;
5724 	margs.ict_ipif = ipif;
5725 	margs.ict_zoneid = zoneid;
5726 	margs.ict_tsl = NULL;
5727 	margs.ict_flags = MATCH_IRE_GW | MATCH_IRE_IPIF | MATCH_IRE_ZONEONLY |
5728 	    MATCH_IRE_TYPE | MATCH_IRE_WQ;
5729 	margs.ict_ipst = ipst;
5730 	margs.ict_wq = wq;
5731 
5732 	return (ip4_ctable_lookup_impl(&margs));
5733 }
5734