xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_dce.c (revision 4ba5c7f8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/strsun.h>
29 #include <sys/zone.h>
30 #include <sys/ddi.h>
31 #include <sys/sunddi.h>
32 #include <sys/cmn_err.h>
33 #include <sys/debug.h>
34 #include <sys/atomic.h>
35 #define	_SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 
38 #include <inet/common.h>
39 #include <inet/mi.h>
40 #include <inet/mib2.h>
41 #include <inet/snmpcom.h>
42 
43 #include <netinet/ip6.h>
44 #include <netinet/icmp6.h>
45 
46 #include <inet/ip.h>
47 #include <inet/ip_impl.h>
48 #include <inet/ip6.h>
49 #include <inet/ip6_asp.h>
50 #include <inet/ip_multi.h>
51 #include <inet/ip_if.h>
52 #include <inet/ip_ire.h>
53 #include <inet/ip_ftable.h>
54 #include <inet/ip_rts.h>
55 #include <inet/ip_ndp.h>
56 #include <inet/ipclassifier.h>
57 #include <inet/ip_listutils.h>
58 
59 #include <sys/sunddi.h>
60 
61 /*
62  * Routines for handling destination cache entries.
63  * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
64  * That entry holds both the IP ident value and the dce generation number.
65  *
66  * Any time a DCE is changed significantly (different path MTU, but NOT
67  * different ULP info!), the dce_generation number is increased.
68  * Also, when a new DCE is created, the dce_generation number in the default
69  * DCE is bumped. That allows the dce_t information to be cached efficiently
70  * as long as the entity caching the dce_t also caches the dce_generation,
71  * and compares the cached generation to detect any changes.
72  * Furthermore, when a DCE is deleted, if there are any outstanding references
73  * to the DCE it will be marked as condemned. The condemned mark is
74  * a designated generation number which is never otherwise used, hence
75  * the single comparison with the generation number captures that as well.
76  *
77  * An example of code which caches is as follows:
78  *
79  *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
80  *		The DCE has changed
81  *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
82  *		    &mystruct->my_dce_generation);
83  *		Not needed in practice, since we have the default DCE:
84  *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
85  *			return failure;
86  *	}
87  *
88  * Note that for IPv6 link-local addresses we record the ifindex since the
89  * link-locals are not globally unique.
90  */
91 
92 /*
93  * Hash bucket structure for DCEs
94  */
95 typedef struct dcb_s {
96 	krwlock_t	dcb_lock;
97 	uint32_t	dcb_cnt;
98 	dce_t		*dcb_dce;
99 } dcb_t;
100 
101 static void	dce_delete_locked(dcb_t *, dce_t *);
102 static void	dce_make_condemned(dce_t *);
103 
104 static kmem_cache_t *dce_cache;
105 
106 
/*
 * Fold a 64-bit value onto itself in 16-bit steps.
 * The argument must be a uint64_t; note that it is evaluated four times.
 */
#define	RANDOM_HASH(v)	\
	((v) ^ ((v) >> 16) ^ ((v) >> 32) ^ ((v) >> 48))
109 
110 /*
111  * Reclaim a fraction of dce's in the dcb.
112  * For now we have a higher probability to delete DCEs without DCE_PMTU.
113  */
114 static void
115 dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
116 {
117 	uint_t	fraction_pmtu = fraction*4;
118 	uint_t	hash;
119 	dce_t	*dce, *nextdce;
120 
121 	rw_enter(&dcb->dcb_lock, RW_WRITER);
122 	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
123 		nextdce = dce->dce_next;
124 		/* Clear DCEF_PMTU if the pmtu is too old */
125 		mutex_enter(&dce->dce_lock);
126 		if ((dce->dce_flags & DCEF_PMTU) &&
127 		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
128 		    ipst->ips_ip_pathmtu_interval) {
129 			dce->dce_flags &= ~DCEF_PMTU;
130 			mutex_exit(&dce->dce_lock);
131 			dce_increment_generation(dce);
132 		} else {
133 			mutex_exit(&dce->dce_lock);
134 		}
135 		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
136 		if (dce->dce_flags & DCEF_PMTU) {
137 			if (hash % fraction_pmtu != 0)
138 				continue;
139 		} else {
140 			if (hash % fraction != 0)
141 				continue;
142 		}
143 
144 		IP_STAT(ipst, ip_dce_reclaim_deleted);
145 		dce_delete_locked(dcb, dce);
146 		dce_refrele(dce);
147 	}
148 	rw_exit(&dcb->dcb_lock);
149 }
150 
151 /*
152  * kmem_cache callback to free up memory.
153  *
154  */
155 static void
156 ip_dce_reclaim_stack(ip_stack_t *ipst)
157 {
158 	int	i;
159 
160 	IP_STAT(ipst, ip_dce_reclaim_calls);
161 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
162 		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
163 		    ipst->ips_ip_dce_reclaim_fraction);
164 
165 		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
166 		    ipst->ips_ip_dce_reclaim_fraction);
167 	}
168 
169 	/*
170 	 * Walk all CONNs that can have a reference on an ire, nce or dce.
171 	 * Get them to update any stale references to drop any refholds they
172 	 * have.
173 	 */
174 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
175 }
176 
177 /*
178  * Called by the memory allocator subsystem directly, when the system
179  * is running low on memory.
180  */
181 /* ARGSUSED */
182 void
183 ip_dce_reclaim(void *args)
184 {
185 	netstack_handle_t nh;
186 	netstack_t *ns;
187 	ip_stack_t *ipst;
188 
189 	netstack_next_init(&nh);
190 	while ((ns = netstack_next(&nh)) != NULL) {
191 		/*
192 		 * netstack_next() can return a netstack_t with a NULL
193 		 * netstack_ip at boot time.
194 		 */
195 		if ((ipst = ns->netstack_ip) == NULL) {
196 			netstack_rele(ns);
197 			continue;
198 		}
199 		ip_dce_reclaim_stack(ipst);
200 		netstack_rele(ns);
201 	}
202 	netstack_next_fini(&nh);
203 }
204 
205 void
206 dce_g_init(void)
207 {
208 	dce_cache = kmem_cache_create("dce_cache",
209 	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
210 }
211 
212 void
213 dce_g_destroy(void)
214 {
215 	kmem_cache_destroy(dce_cache);
216 }
217 
218 
219 /*
220  * Allocate a default DCE and a hash table for per-IP address DCEs
221  */
222 void
223 dce_stack_init(ip_stack_t *ipst)
224 {
225 	int	i;
226 
227 	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
228 	bzero(ipst->ips_dce_default, sizeof (dce_t));
229 	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
230 	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
231 	ipst->ips_dce_default->dce_last_change_time =
232 	    TICK_TO_SEC(ddi_get_lbolt64());
233 	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
234 	ipst->ips_dce_default->dce_ipst = ipst;
235 
236 	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
237 	ipst->ips_dce_hashsize = 256;
238 	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
239 	    sizeof (dcb_t), KM_SLEEP);
240 	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
241 	    sizeof (dcb_t), KM_SLEEP);
242 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
243 		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
244 		    NULL);
245 		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
246 		    NULL);
247 	}
248 }
249 
250 void
251 dce_stack_destroy(ip_stack_t *ipst)
252 {
253 	int i;
254 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
255 		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
256 		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
257 	}
258 	kmem_free(ipst->ips_dce_hash_v4,
259 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
260 	ipst->ips_dce_hash_v4 = NULL;
261 	kmem_free(ipst->ips_dce_hash_v6,
262 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
263 	ipst->ips_dce_hash_v6 = NULL;
264 	ipst->ips_dce_hashsize = 0;
265 
266 	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
267 	kmem_cache_free(dce_cache, ipst->ips_dce_default);
268 	ipst->ips_dce_default = NULL;
269 }
270 
271 /* When any DCE is good enough */
272 dce_t *
273 dce_get_default(ip_stack_t *ipst)
274 {
275 	dce_t		*dce;
276 
277 	dce = ipst->ips_dce_default;
278 	dce_refhold(dce);
279 	return (dce);
280 }
281 
282 /*
283  * Generic for IPv4 and IPv6.
284  *
285  * Used by callers that need to cache e.g., the datapath
286  * Returns the generation number in the last argument.
287  */
288 dce_t *
289 dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
290 {
291 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
292 		/*
293 		 * If we have a source route we need to look for the final
294 		 * destination in the source route option.
295 		 */
296 		ipaddr_t final_dst;
297 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
298 
299 		final_dst = ip_get_dst(ipha);
300 		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
301 	} else {
302 		uint_t ifindex;
303 		/*
304 		 * If we have a routing header we need to look for the final
305 		 * destination in the routing extension header.
306 		 */
307 		in6_addr_t final_dst;
308 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
309 
310 		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
311 		ifindex = 0;
312 		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
313 			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
314 			    ill_phyint->phyint_ifindex;
315 		}
316 		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
317 		    generationp));
318 	}
319 }
320 
321 /*
322  * Used by callers that need to cache e.g., the datapath
323  * Returns the generation number in the last argument.
324  */
325 dce_t *
326 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
327 {
328 	uint_t		hash;
329 	dcb_t		*dcb;
330 	dce_t		*dce;
331 
332 	/* Set *generationp before dropping the lock(s) that allow additions */
333 	if (generationp != NULL)
334 		*generationp = ipst->ips_dce_default->dce_generation;
335 
336 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
337 	dcb = &ipst->ips_dce_hash_v4[hash];
338 	rw_enter(&dcb->dcb_lock, RW_READER);
339 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
340 		if (dce->dce_v4addr == dst) {
341 			mutex_enter(&dce->dce_lock);
342 			if (!DCE_IS_CONDEMNED(dce)) {
343 				dce_refhold(dce);
344 				if (generationp != NULL)
345 					*generationp = dce->dce_generation;
346 				mutex_exit(&dce->dce_lock);
347 				rw_exit(&dcb->dcb_lock);
348 				return (dce);
349 			}
350 			mutex_exit(&dce->dce_lock);
351 		}
352 	}
353 	rw_exit(&dcb->dcb_lock);
354 	/* Not found */
355 	dce = ipst->ips_dce_default;
356 	dce_refhold(dce);
357 	return (dce);
358 }
359 
360 /*
361  * Used by callers that need to cache e.g., the datapath
362  * Returns the generation number in the last argument.
363  * ifindex should only be set for link-locals
364  */
365 dce_t *
366 dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
367     uint_t *generationp)
368 {
369 	uint_t		hash;
370 	dcb_t		*dcb;
371 	dce_t		*dce;
372 
373 	/* Set *generationp before dropping the lock(s) that allow additions */
374 	if (generationp != NULL)
375 		*generationp = ipst->ips_dce_default->dce_generation;
376 
377 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
378 	dcb = &ipst->ips_dce_hash_v6[hash];
379 	rw_enter(&dcb->dcb_lock, RW_READER);
380 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
381 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
382 		    dce->dce_ifindex == ifindex) {
383 			mutex_enter(&dce->dce_lock);
384 			if (!DCE_IS_CONDEMNED(dce)) {
385 				dce_refhold(dce);
386 				if (generationp != NULL)
387 					*generationp = dce->dce_generation;
388 				mutex_exit(&dce->dce_lock);
389 				rw_exit(&dcb->dcb_lock);
390 				return (dce);
391 			}
392 			mutex_exit(&dce->dce_lock);
393 		}
394 	}
395 	rw_exit(&dcb->dcb_lock);
396 	/* Not found */
397 	dce = ipst->ips_dce_default;
398 	dce_refhold(dce);
399 	return (dce);
400 }
401 
402 /*
403  * Atomically looks for a non-default DCE, and if not found tries to create one.
404  * If there is no memory it returns NULL.
405  * When an entry is created we increase the generation number on
406  * the default DCE so that conn_ip_output will detect there is a new DCE.
407  */
408 dce_t *
409 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
410 {
411 	uint_t		hash;
412 	dcb_t		*dcb;
413 	dce_t		*dce;
414 
415 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
416 	dcb = &ipst->ips_dce_hash_v4[hash];
417 	rw_enter(&dcb->dcb_lock, RW_WRITER);
418 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
419 		if (dce->dce_v4addr == dst) {
420 			mutex_enter(&dce->dce_lock);
421 			if (!DCE_IS_CONDEMNED(dce)) {
422 				dce_refhold(dce);
423 				mutex_exit(&dce->dce_lock);
424 				rw_exit(&dcb->dcb_lock);
425 				return (dce);
426 			}
427 			mutex_exit(&dce->dce_lock);
428 		}
429 	}
430 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
431 	if (dce == NULL) {
432 		rw_exit(&dcb->dcb_lock);
433 		return (NULL);
434 	}
435 	bzero(dce, sizeof (dce_t));
436 	dce->dce_ipst = ipst;	/* No netstack_hold */
437 	dce->dce_v4addr = dst;
438 	dce->dce_generation = DCE_GENERATION_INITIAL;
439 	dce->dce_ipversion = IPV4_VERSION;
440 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
441 	dce_refhold(dce);	/* For the hash list */
442 
443 	/* Link into list */
444 	if (dcb->dcb_dce != NULL)
445 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
446 	dce->dce_next = dcb->dcb_dce;
447 	dce->dce_ptpn = &dcb->dcb_dce;
448 	dcb->dcb_dce = dce;
449 	dce->dce_bucket = dcb;
450 	dce_refhold(dce);	/* For the caller */
451 	rw_exit(&dcb->dcb_lock);
452 
453 	/* Initialize dce_ident to be different than for the last packet */
454 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
455 
456 	dce_increment_generation(ipst->ips_dce_default);
457 	return (dce);
458 }
459 
460 /*
461  * Atomically looks for a non-default DCE, and if not found tries to create one.
462  * If there is no memory it returns NULL.
463  * When an entry is created we increase the generation number on
464  * the default DCE so that conn_ip_output will detect there is a new DCE.
465  * ifindex should only be used with link-local addresses.
466  */
467 dce_t *
468 dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
469 {
470 	uint_t		hash;
471 	dcb_t		*dcb;
472 	dce_t		*dce;
473 
474 	/* We should not create entries for link-locals w/o an ifindex */
475 	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
476 
477 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
478 	dcb = &ipst->ips_dce_hash_v6[hash];
479 	rw_enter(&dcb->dcb_lock, RW_WRITER);
480 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
481 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
482 		    dce->dce_ifindex == ifindex) {
483 			mutex_enter(&dce->dce_lock);
484 			if (!DCE_IS_CONDEMNED(dce)) {
485 				dce_refhold(dce);
486 				mutex_exit(&dce->dce_lock);
487 				rw_exit(&dcb->dcb_lock);
488 				return (dce);
489 			}
490 			mutex_exit(&dce->dce_lock);
491 		}
492 	}
493 
494 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
495 	if (dce == NULL) {
496 		rw_exit(&dcb->dcb_lock);
497 		return (NULL);
498 	}
499 	bzero(dce, sizeof (dce_t));
500 	dce->dce_ipst = ipst;	/* No netstack_hold */
501 	dce->dce_v6addr = *dst;
502 	dce->dce_ifindex = ifindex;
503 	dce->dce_generation = DCE_GENERATION_INITIAL;
504 	dce->dce_ipversion = IPV6_VERSION;
505 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
506 	dce_refhold(dce);	/* For the hash list */
507 
508 	/* Link into list */
509 	if (dcb->dcb_dce != NULL)
510 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
511 	dce->dce_next = dcb->dcb_dce;
512 	dce->dce_ptpn = &dcb->dcb_dce;
513 	dcb->dcb_dce = dce;
514 	dce->dce_bucket = dcb;
515 	atomic_add_32(&dcb->dcb_cnt, 1);
516 	dce_refhold(dce);	/* For the caller */
517 	rw_exit(&dcb->dcb_lock);
518 
519 	/* Initialize dce_ident to be different than for the last packet */
520 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
521 	dce_increment_generation(ipst->ips_dce_default);
522 	return (dce);
523 }
524 
525 /*
526  * Set/update uinfo. Creates a per-destination dce if none exists.
527  *
528  * Note that we do not bump the generation number here.
529  * New connections will find the new uinfo.
530  *
531  * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
532  */
533 static void
534 dce_setuinfo(dce_t *dce, iulp_t *uinfo)
535 {
536 	/*
537 	 * Update the round trip time estimate and/or the max frag size
538 	 * and/or the slow start threshold.
539 	 *
540 	 * We serialize multiple advises using dce_lock.
541 	 */
542 	mutex_enter(&dce->dce_lock);
543 	/* Gard against setting to zero */
544 	if (uinfo->iulp_rtt != 0) {
545 		/*
546 		 * If there is no old cached values, initialize them
547 		 * conservatively.  Set them to be (1.5 * new value).
548 		 */
549 		if (dce->dce_uinfo.iulp_rtt != 0) {
550 			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
551 			    uinfo->iulp_rtt) >> 1;
552 		} else {
553 			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
554 			    (uinfo->iulp_rtt >> 1);
555 		}
556 		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
557 			dce->dce_uinfo.iulp_rtt_sd =
558 			    (dce->dce_uinfo.iulp_rtt_sd +
559 			    uinfo->iulp_rtt_sd) >> 1;
560 		} else {
561 			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
562 			    (uinfo->iulp_rtt_sd >> 1);
563 		}
564 	}
565 	if (uinfo->iulp_mtu != 0) {
566 		if (dce->dce_flags & DCEF_PMTU) {
567 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
568 		} else {
569 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
570 			dce->dce_flags |= DCEF_PMTU;
571 		}
572 		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
573 	}
574 	if (uinfo->iulp_ssthresh != 0) {
575 		if (dce->dce_uinfo.iulp_ssthresh != 0)
576 			dce->dce_uinfo.iulp_ssthresh =
577 			    (uinfo->iulp_ssthresh +
578 			    dce->dce_uinfo.iulp_ssthresh) >> 1;
579 		else
580 			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
581 	}
582 	/* We have uinfo for sure */
583 	dce->dce_flags |= DCEF_UINFO;
584 	mutex_exit(&dce->dce_lock);
585 }
586 
587 
588 int
589 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
590 {
591 	dce_t *dce;
592 
593 	dce = dce_lookup_and_add_v4(dst, ipst);
594 	if (dce == NULL)
595 		return (ENOMEM);
596 
597 	dce_setuinfo(dce, uinfo);
598 	dce_refrele(dce);
599 	return (0);
600 }
601 
602 int
603 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
604     ip_stack_t *ipst)
605 {
606 	dce_t *dce;
607 
608 	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
609 	if (dce == NULL)
610 		return (ENOMEM);
611 
612 	dce_setuinfo(dce, uinfo);
613 	dce_refrele(dce);
614 	return (0);
615 }
616 
617 /* Common routine for IPv4 and IPv6 */
618 int
619 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
620     ip_stack_t *ipst)
621 {
622 	ipaddr_t dst4;
623 
624 	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
625 		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
626 		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
627 	} else {
628 		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
629 	}
630 }
631 
632 static void
633 dce_make_condemned(dce_t *dce)
634 {
635 	ip_stack_t	*ipst = dce->dce_ipst;
636 
637 	mutex_enter(&dce->dce_lock);
638 	ASSERT(!DCE_IS_CONDEMNED(dce));
639 	dce->dce_generation = DCE_GENERATION_CONDEMNED;
640 	mutex_exit(&dce->dce_lock);
641 	/* Count how many condemned dces for kmem_cache callback */
642 	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
643 }
644 
645 /*
646  * Increment the generation avoiding the special condemned value
647  */
648 void
649 dce_increment_generation(dce_t *dce)
650 {
651 	uint_t generation;
652 
653 	mutex_enter(&dce->dce_lock);
654 	if (!DCE_IS_CONDEMNED(dce)) {
655 		generation = dce->dce_generation + 1;
656 		if (generation == DCE_GENERATION_CONDEMNED)
657 			generation = DCE_GENERATION_INITIAL;
658 		ASSERT(generation != DCE_GENERATION_VERIFY);
659 		dce->dce_generation = generation;
660 	}
661 	mutex_exit(&dce->dce_lock);
662 }
663 
664 /*
665  * Increment the generation number on all dces that have a path MTU and
666  * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
667  */
668 void
669 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
670 {
671 	int		i;
672 	dcb_t		*dcb;
673 	dce_t		*dce;
674 
675 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
676 		if (isv6)
677 			dcb = &ipst->ips_dce_hash_v6[i];
678 		else
679 			dcb = &ipst->ips_dce_hash_v4[i];
680 		rw_enter(&dcb->dcb_lock, RW_WRITER);
681 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
682 			if (DCE_IS_CONDEMNED(dce))
683 				continue;
684 			dce_increment_generation(dce);
685 		}
686 		rw_exit(&dcb->dcb_lock);
687 	}
688 	dce_increment_generation(ipst->ips_dce_default);
689 }
690 
691 /*
692  * Caller needs to do a dce_refrele since we can't do the
693  * dce_refrele under dcb_lock.
694  */
695 static void
696 dce_delete_locked(dcb_t *dcb, dce_t *dce)
697 {
698 	dce->dce_bucket = NULL;
699 	*dce->dce_ptpn = dce->dce_next;
700 	if (dce->dce_next != NULL)
701 		dce->dce_next->dce_ptpn = dce->dce_ptpn;
702 	dce->dce_ptpn = NULL;
703 	dce->dce_next = NULL;
704 	atomic_add_32(&dcb->dcb_cnt, -1);
705 	dce_make_condemned(dce);
706 }
707 
708 static void
709 dce_inactive(dce_t *dce)
710 {
711 	ip_stack_t	*ipst = dce->dce_ipst;
712 
713 	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
714 	ASSERT(dce->dce_ptpn == NULL);
715 	ASSERT(dce->dce_bucket == NULL);
716 
717 	/* Count how many condemned dces for kmem_cache callback */
718 	if (DCE_IS_CONDEMNED(dce))
719 		atomic_add_32(&ipst->ips_num_dce_condemned, -1);
720 
721 	kmem_cache_free(dce_cache, dce);
722 }
723 
724 void
725 dce_refrele(dce_t *dce)
726 {
727 	ASSERT(dce->dce_refcnt != 0);
728 	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
729 		dce_inactive(dce);
730 }
731 
732 void
733 dce_refhold(dce_t *dce)
734 {
735 	atomic_add_32(&dce->dce_refcnt, 1);
736 	ASSERT(dce->dce_refcnt != 0);
737 }
738 
739 /* No tracing support yet hence the same as the above functions */
740 void
741 dce_refrele_notr(dce_t *dce)
742 {
743 	ASSERT(dce->dce_refcnt != 0);
744 	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
745 		dce_inactive(dce);
746 }
747 
748 void
749 dce_refhold_notr(dce_t *dce)
750 {
751 	atomic_add_32(&dce->dce_refcnt, 1);
752 	ASSERT(dce->dce_refcnt != 0);
753 }
754 
755 /* Report both the IPv4 and IPv6 DCEs. */
756 mblk_t *
757 ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
758 {
759 	struct opthdr		*optp;
760 	mblk_t			*mp2ctl;
761 	dest_cache_entry_t	dest_cache;
762 	mblk_t			*mp_tail = NULL;
763 	dce_t			*dce;
764 	dcb_t			*dcb;
765 	int			i;
766 	uint64_t		current_time;
767 
768 	current_time = TICK_TO_SEC(ddi_get_lbolt64());
769 
770 	/*
771 	 * make a copy of the original message
772 	 */
773 	mp2ctl = copymsg(mpctl);
774 
775 	/* First we do IPv4 entries */
776 	optp = (struct opthdr *)&mpctl->b_rptr[
777 	    sizeof (struct T_optmgmt_ack)];
778 	optp->level = MIB2_IP;
779 	optp->name = EXPER_IP_DCE;
780 
781 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
782 		dcb = &ipst->ips_dce_hash_v4[i];
783 		rw_enter(&dcb->dcb_lock, RW_READER);
784 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
785 			dest_cache.DestIpv4Address = dce->dce_v4addr;
786 			dest_cache.DestFlags = dce->dce_flags;
787 			if (dce->dce_flags & DCEF_PMTU)
788 				dest_cache.DestPmtu = dce->dce_pmtu;
789 			else
790 				dest_cache.DestPmtu = 0;
791 			dest_cache.DestIdent = dce->dce_ident;
792 			dest_cache.DestIfindex = 0;
793 			dest_cache.DestAge = current_time -
794 			    dce->dce_last_change_time;
795 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
796 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
797 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
798 				    "failed to allocate %u bytes\n",
799 				    (uint_t)sizeof (dest_cache)));
800 			}
801 		}
802 		rw_exit(&dcb->dcb_lock);
803 	}
804 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
805 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
806 	    (int)optp->level, (int)optp->name, (int)optp->len));
807 	qreply(q, mpctl);
808 
809 	if (mp2ctl == NULL) {
810 		/* Copymsg failed above */
811 		return (NULL);
812 	}
813 
814 	/* Now for IPv6 */
815 	mpctl = mp2ctl;
816 	mp_tail = NULL;
817 	mp2ctl = copymsg(mpctl);
818 	optp = (struct opthdr *)&mpctl->b_rptr[
819 	    sizeof (struct T_optmgmt_ack)];
820 	optp->level = MIB2_IP6;
821 	optp->name = EXPER_IP_DCE;
822 
823 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
824 		dcb = &ipst->ips_dce_hash_v6[i];
825 		rw_enter(&dcb->dcb_lock, RW_READER);
826 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
827 			dest_cache.DestIpv6Address = dce->dce_v6addr;
828 			dest_cache.DestFlags = dce->dce_flags;
829 			if (dce->dce_flags & DCEF_PMTU)
830 				dest_cache.DestPmtu = dce->dce_pmtu;
831 			else
832 				dest_cache.DestPmtu = 0;
833 			dest_cache.DestIdent = dce->dce_ident;
834 			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
835 				dest_cache.DestIfindex = dce->dce_ifindex;
836 			else
837 				dest_cache.DestIfindex = 0;
838 			dest_cache.DestAge = current_time -
839 			    dce->dce_last_change_time;
840 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
841 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
842 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
843 				    "failed to allocate %u bytes\n",
844 				    (uint_t)sizeof (dest_cache)));
845 			}
846 		}
847 		rw_exit(&dcb->dcb_lock);
848 	}
849 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
850 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
851 	    (int)optp->level, (int)optp->name, (int)optp->len));
852 	qreply(q, mpctl);
853 
854 	return (mp2ctl);
855 }
856 
857 /*
858  * Remove IPv6 DCEs which refer to an ifindex that is going away.
859  * This is not required for correctness, but it avoids netstat -d
860  * showing stale stuff that will never be used.
861  */
862 void
863 dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
864 {
865 	uint_t	i;
866 	dcb_t	*dcb;
867 	dce_t	*dce, *nextdce;
868 
869 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
870 		dcb = &ipst->ips_dce_hash_v6[i];
871 		rw_enter(&dcb->dcb_lock, RW_WRITER);
872 
873 		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
874 			nextdce = dce->dce_next;
875 			if (dce->dce_ifindex == ifindex) {
876 				dce_delete_locked(dcb, dce);
877 				dce_refrele(dce);
878 			}
879 		}
880 		rw_exit(&dcb->dcb_lock);
881 	}
882 }
883