xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ndp.c (revision 57c40785)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ipsec_info.h>
66 #include <inet/sctp_ip.h>
67 
68 /*
69  * Function names with nce_ prefix are static while function
70  * names with ndp_ prefix are used by rest of the IP.
71  *
72  * Lock ordering:
73  *
74  *	ndp_g_lock -> ill_lock -> nce_lock
75  *
76  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
77  * nce_next.  Nce_lock protects the contents of the NCE (particularly
78  * nce_refcnt).
79  */
80 
81 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
82     uint32_t ll_addr_len);
83 static	void	nce_ire_delete(nce_t *nce);
84 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
85 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
86 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
87 static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
88 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
89     uchar_t *addr);
90 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
91 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
92 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
93 static	void	nce_update(nce_t *nce, uint16_t new_state,
94     uchar_t *new_ll_addr);
95 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
96 static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
97     ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
98     const in6_addr_t *target, int flag);
99 static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
100     nce_t **, nce_t *);
101 
102 #ifdef DEBUG
103 static void	nce_trace_cleanup(const nce_t *);
104 #endif
105 
/*
 * Yields the address of the slot in the IPv4 NDP table's nce_hash_tbl
 * (reached through ipst->ips_ndp4) that IRE_ADDR_HASH selects for addr
 * with a table size of NCE_TABLE_SIZE.
 */
106 #define	NCE_HASH_PTR_V4(ipst, addr)					\
107 	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
108 
/*
 * Yields the address of the slot in the IPv6 NDP table's nce_hash_tbl
 * (reached through ipst->ips_ndp6) that NCE_ADDR_HASH_V6 selects for addr
 * with a table size of NCE_TABLE_SIZE.
 */
109 #define	NCE_HASH_PTR_V6(ipst, addr)				 \
110 	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
111 		NCE_TABLE_SIZE)]))
112 
113 /*
114  * Compute default flags to use for an advertisement of this nce's address.
115  */
116 static int
117 nce_advert_flags(const nce_t *nce)
118 {
119 	int flag = 0;
120 
121 	if (nce->nce_flags & NCE_F_ISROUTER)
122 		flag |= NDP_ISROUTER;
123 	if (!(nce->nce_flags & NCE_F_ANYCAST))
124 		flag |= NDP_ORIDE;
125 
126 	return (flag);
127 }
128 
129 /* Non-tunable probe interval, based on link capabilities */
/*
 * Evaluates to 150 when the ill has ill_note_link set and to 1500
 * otherwise.  The result is handed to NDP_RESTART_TIMER by the callers in
 * this file; NOTE(review): presumably milliseconds -- confirm against the
 * NDP_RESTART_TIMER definition.
 */
130 #define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
131 
132 /*
133  * NDP Cache Entry creation routine.
134  * Mapped entries will never do NUD .
135  * This routine must always be called with ndp6->ndp_g_lock held.
136  * Prior to return, nce_refcnt is incremented.
137  */
138 int
139 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
140     const in6_addr_t *mask, const in6_addr_t *extract_mask,
141     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
142     nce_t **newnce)
143 {
144 	static	nce_t		nce_nil;
145 	nce_t		*nce;
146 	mblk_t		*mp;
147 	mblk_t		*template;
148 	nce_t		**ncep;
149 	int		err;
150 	boolean_t	dropped = B_FALSE;
151 	ip_stack_t	*ipst = ill->ill_ipst;
152 
153 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
154 	ASSERT(ill != NULL && ill->ill_isv6);
155 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
156 		ip0dbg(("ndp_add_v6: no addr\n"));
157 		return (EINVAL);
158 	}
159 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
160 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
161 		return (EINVAL);
162 	}
163 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
164 	    (flags & NCE_F_MAPPING)) {
165 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
166 		return (EINVAL);
167 	}
168 	/*
169 	 * Allocate the mblk to hold the nce.
170 	 *
171 	 * XXX This can come out of a separate cache - nce_cache.
172 	 * We don't need the mp anymore as there are no more
173 	 * "qwriter"s
174 	 */
175 	mp = allocb(sizeof (nce_t), BPRI_MED);
176 	if (mp == NULL)
177 		return (ENOMEM);
178 
179 	nce = (nce_t *)mp->b_rptr;
180 	mp->b_wptr = (uchar_t *)&nce[1];
181 	*nce = nce_nil;
182 
183 	/*
184 	 * This one holds link layer address
185 	 */
186 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
187 		template = nce_udreq_alloc(ill);
188 	} else {
189 		if (ill->ill_resolver_mp == NULL) {
190 			freeb(mp);
191 			return (EINVAL);
192 		}
193 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
194 		template = copyb(ill->ill_resolver_mp);
195 	}
196 	if (template == NULL) {
197 		freeb(mp);
198 		return (ENOMEM);
199 	}
200 	nce->nce_ill = ill;
201 	nce->nce_ipversion = IPV6_VERSION;
202 	nce->nce_flags = flags;
203 	nce->nce_state = state;
204 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
205 	nce->nce_rcnt = ill->ill_xmit_count;
206 	nce->nce_addr = *addr;
207 	nce->nce_mask = *mask;
208 	nce->nce_extract_mask = *extract_mask;
209 	nce->nce_ll_extract_start = hw_extract_start;
210 	nce->nce_fp_mp = NULL;
211 	nce->nce_res_mp = template;
212 	if (state == ND_REACHABLE)
213 		nce->nce_last = TICK_TO_MSEC(lbolt64);
214 	else
215 		nce->nce_last = 0;
216 	nce->nce_qd_mp = NULL;
217 	nce->nce_mp = mp;
218 	if (hw_addr != NULL)
219 		nce_set_ll(nce, hw_addr);
220 	/* This one is for nce getting created */
221 	nce->nce_refcnt = 1;
222 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
223 	if (nce->nce_flags & NCE_F_MAPPING) {
224 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
225 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
226 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
227 		ncep = &ipst->ips_ndp6->nce_mask_entries;
228 	} else {
229 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
230 	}
231 
232 	nce->nce_trace_disable = B_FALSE;
233 
234 	/*
235 	 * Atomically ensure that the ill is not CONDEMNED, before
236 	 * adding the NCE.
237 	 */
238 	mutex_enter(&ill->ill_lock);
239 	if (ill->ill_state_flags & ILL_CONDEMNED) {
240 		mutex_exit(&ill->ill_lock);
241 		freeb(mp);
242 		freeb(template);
243 		return (EINVAL);
244 	}
245 	if ((nce->nce_next = *ncep) != NULL)
246 		nce->nce_next->nce_ptpn = &nce->nce_next;
247 	*ncep = nce;
248 	nce->nce_ptpn = ncep;
249 	*newnce = nce;
250 	/* This one is for nce being used by an active thread */
251 	NCE_REFHOLD(*newnce);
252 
253 	/* Bump up the number of nce's referencing this ill */
254 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
255 	    (char *), "nce", (void *), nce);
256 	ill->ill_nce_cnt++;
257 	mutex_exit(&ill->ill_lock);
258 
259 	err = 0;
260 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
261 		mutex_enter(&nce->nce_lock);
262 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
263 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
264 		mutex_exit(&nce->nce_lock);
265 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
266 		    &ipv6_all_zeros, addr, NDP_PROBE);
267 		if (dropped) {
268 			mutex_enter(&nce->nce_lock);
269 			nce->nce_pcnt++;
270 			mutex_exit(&nce->nce_lock);
271 		}
272 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
273 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
274 		err = EINPROGRESS;
275 	} else if (flags & NCE_F_UNSOL_ADV) {
276 		/*
277 		 * We account for the transmit below by assigning one
278 		 * less than the ndd variable. Subsequent decrements
279 		 * are done in ndp_timer.
280 		 */
281 		mutex_enter(&nce->nce_lock);
282 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
283 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
284 		mutex_exit(&nce->nce_lock);
285 		dropped = nce_xmit(ill,
286 		    ND_NEIGHBOR_ADVERT,
287 		    ill,	/* ill to be used for extracting ill_nd_lla */
288 		    B_TRUE,	/* use ill_nd_lla */
289 		    addr,	/* Source and target of the advertisement pkt */
290 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
291 		    nce_advert_flags(nce));
292 		mutex_enter(&nce->nce_lock);
293 		if (dropped)
294 			nce->nce_unsolicit_count++;
295 		if (nce->nce_unsolicit_count != 0) {
296 			nce->nce_timeout_id = timeout(ndp_timer, nce,
297 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
298 		}
299 		mutex_exit(&nce->nce_lock);
300 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
301 	}
302 	/*
303 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
304 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
305 	 * We call nce_fastpath from nce_update if the link layer address of
306 	 * the peer changes from nce_update
307 	 */
308 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
309 		nce_fastpath(nce);
310 	return (err);
311 }
312 
313 int
314 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
315     const in6_addr_t *mask, const in6_addr_t *extract_mask,
316     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
317     nce_t **newnce)
318 {
319 	int	err = 0;
320 	nce_t	*nce;
321 	ip_stack_t	*ipst = ill->ill_ipst;
322 
323 	ASSERT(ill->ill_isv6);
324 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
325 
326 	/* Get head of v6 hash table */
327 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
328 	nce = nce_lookup_addr(ill, addr, nce);
329 	if (nce == NULL) {
330 		err = ndp_add_v6(ill,
331 		    hw_addr,
332 		    addr,
333 		    mask,
334 		    extract_mask,
335 		    hw_extract_start,
336 		    flags,
337 		    state,
338 		    newnce);
339 	} else {
340 		*newnce = nce;
341 		err = EEXIST;
342 	}
343 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
344 	return (err);
345 }
346 
347 /*
348  * Remove all the CONDEMNED nces from the appropriate hash table.
349  * We create a private list of NCEs, these may have ires pointing
350  * to them, so the list will be passed through to clean up dependent
351  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
352  */
353 static void
354 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
355 {
356 	nce_t *nce1;
357 	nce_t **ptpn;
358 
359 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
360 	ASSERT(ndp->ndp_g_walker == 0);
361 	for (; nce; nce = nce1) {
362 		nce1 = nce->nce_next;
363 		mutex_enter(&nce->nce_lock);
364 		if (nce->nce_flags & NCE_F_CONDEMNED) {
365 			ptpn = nce->nce_ptpn;
366 			nce1 = nce->nce_next;
367 			if (nce1 != NULL)
368 				nce1->nce_ptpn = ptpn;
369 			*ptpn = nce1;
370 			nce->nce_ptpn = NULL;
371 			nce->nce_next = NULL;
372 			nce->nce_next = *free_nce_list;
373 			*free_nce_list = nce;
374 		}
375 		mutex_exit(&nce->nce_lock);
376 	}
377 }
378 
379 /*
380  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
381  *    will return this NCE. Also no new IREs will be created that
382  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
383  *    be started (See NDP_RESTART_TIMER).
384  * 2. Cancel any currently running timeouts.
385  * 3. If there is an ndp walker, return. The walker will do the cleanup.
386  *    This ensures that walkers see a consistent list of NCEs while walking.
387  * 4. Otherwise remove the NCE from the list of NCEs
388  * 5. Delete all IREs pointing to this NCE.
389  */
390 void
391 ndp_delete(nce_t *nce)
392 {
393 	nce_t	**ptpn;
394 	nce_t	*nce1;
395 	int	ipversion = nce->nce_ipversion;
396 	ndp_g_t *ndp;
397 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
398 
399 	if (ipversion == IPV4_VERSION)
400 		ndp = ipst->ips_ndp4;
401 	else
402 		ndp = ipst->ips_ndp6;
403 
404 	/* Serialize deletes */
405 	mutex_enter(&nce->nce_lock);
406 	if (nce->nce_flags & NCE_F_CONDEMNED) {
407 		/* Some other thread is doing the delete */
408 		mutex_exit(&nce->nce_lock);
409 		return;
410 	}
411 	/*
412 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
413 	 * refcnt has to be >= 2
414 	 */
415 	ASSERT(nce->nce_refcnt >= 2);
416 	nce->nce_flags |= NCE_F_CONDEMNED;
417 	mutex_exit(&nce->nce_lock);
418 
419 	nce_fastpath_list_delete(nce);
420 
421 	/*
422 	 * Cancel any running timer. Timeout can't be restarted
423 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
424 	 * Passing invalid timeout id is fine.
425 	 */
426 	if (nce->nce_timeout_id != 0) {
427 		(void) untimeout(nce->nce_timeout_id);
428 		nce->nce_timeout_id = 0;
429 	}
430 
431 	mutex_enter(&ndp->ndp_g_lock);
432 	if (nce->nce_ptpn == NULL) {
433 		/*
434 		 * The last ndp walker has already removed this nce from
435 		 * the list after we marked the nce CONDEMNED and before
436 		 * we grabbed the global lock.
437 		 */
438 		mutex_exit(&ndp->ndp_g_lock);
439 		return;
440 	}
441 	if (ndp->ndp_g_walker > 0) {
442 		/*
443 		 * Can't unlink. The walker will clean up
444 		 */
445 		ndp->ndp_g_walker_cleanup = B_TRUE;
446 		mutex_exit(&ndp->ndp_g_lock);
447 		return;
448 	}
449 
450 	/*
451 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
452 	 * the timer since it is marked CONDEMNED.
453 	 */
454 	ptpn = nce->nce_ptpn;
455 	nce1 = nce->nce_next;
456 	if (nce1 != NULL)
457 		nce1->nce_ptpn = ptpn;
458 	*ptpn = nce1;
459 	nce->nce_ptpn = NULL;
460 	nce->nce_next = NULL;
461 	mutex_exit(&ndp->ndp_g_lock);
462 
463 	nce_ire_delete(nce);
464 }
465 
466 void
467 ndp_inactive(nce_t *nce)
468 {
469 	mblk_t		**mpp;
470 	ill_t		*ill;
471 
472 	ASSERT(nce->nce_refcnt == 0);
473 	ASSERT(MUTEX_HELD(&nce->nce_lock));
474 	ASSERT(nce->nce_fastpath == NULL);
475 
476 	/* Free all nce allocated messages */
477 	mpp = &nce->nce_first_mp_to_free;
478 	do {
479 		while (*mpp != NULL) {
480 			mblk_t  *mp;
481 
482 			mp = *mpp;
483 			*mpp = mp->b_next;
484 
485 			inet_freemsg(mp);
486 		}
487 	} while (mpp++ != &nce->nce_last_mp_to_free);
488 
489 #ifdef DEBUG
490 	nce_trace_cleanup(nce);
491 #endif
492 
493 	ill = nce->nce_ill;
494 	mutex_enter(&ill->ill_lock);
495 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
496 	    (char *), "nce", (void *), nce);
497 	ill->ill_nce_cnt--;
498 	/*
499 	 * If the number of nce's associated with this ill have dropped
500 	 * to zero, check whether we need to restart any operation that
501 	 * is waiting for this to happen.
502 	 */
503 	if (ILL_DOWN_OK(ill)) {
504 		/* ipif_ill_refrele_tail drops the ill_lock */
505 		ipif_ill_refrele_tail(ill);
506 	} else {
507 		mutex_exit(&ill->ill_lock);
508 	}
509 	mutex_destroy(&nce->nce_lock);
510 	if (nce->nce_mp != NULL)
511 		inet_freemsg(nce->nce_mp);
512 }
513 
514 /*
515  * ndp_walk routine.  Delete the nce if it is associated with the ill
516  * that is going away.  Always called as a writer.
517  */
518 void
519 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
520 {
521 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
522 		ndp_delete(nce);
523 	}
524 }
525 
526 /*
527  * Walk a list of to be inactive NCEs and blow away all the ires.
528  */
529 static void
530 nce_ire_delete_list(nce_t *nce)
531 {
532 	nce_t *nce_next;
533 
534 	ASSERT(nce != NULL);
535 	while (nce != NULL) {
536 		nce_next = nce->nce_next;
537 		nce->nce_next = NULL;
538 
539 		/*
540 		 * It is possible for the last ndp walker (this thread)
541 		 * to come here after ndp_delete has marked the nce CONDEMNED
542 		 * and before it has removed the nce from the fastpath list
543 		 * or called untimeout. So we need to do it here. It is safe
544 		 * for both ndp_delete and this thread to do it twice or
545 		 * even simultaneously since each of the threads has a
546 		 * reference on the nce.
547 		 */
548 		nce_fastpath_list_delete(nce);
549 		/*
550 		 * Cancel any running timer. Timeout can't be restarted
551 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
552 		 * Passing invalid timeout id is fine.
553 		 */
554 		if (nce->nce_timeout_id != 0) {
555 			(void) untimeout(nce->nce_timeout_id);
556 			nce->nce_timeout_id = 0;
557 		}
558 		/*
559 		 * We might hit this func thus in the v4 case:
560 		 * ipif_down->ipif_ndp_down->ndp_walk
561 		 */
562 
563 		if (nce->nce_ipversion == IPV4_VERSION) {
564 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
565 			    IRE_CACHE, nce_ire_delete1,
566 			    (char *)nce, nce->nce_ill);
567 		} else {
568 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
569 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
570 			    IRE_CACHE, nce_ire_delete1,
571 			    (char *)nce, nce->nce_ill);
572 		}
573 		NCE_REFRELE_NOTR(nce);
574 		nce = nce_next;
575 	}
576 }
577 
578 /*
579  * Delete an ire when the nce goes away.
580  */
581 /* ARGSUSED */
582 static void
583 nce_ire_delete(nce_t *nce)
584 {
585 	if (nce->nce_ipversion == IPV6_VERSION) {
586 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
587 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
588 		NCE_REFRELE_NOTR(nce);
589 	} else {
590 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
591 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
592 		NCE_REFRELE_NOTR(nce);
593 	}
594 }
595 
596 /*
597  * ire_walk routine used to delete every IRE that shares this nce
598  */
599 static void
600 nce_ire_delete1(ire_t *ire, char *nce_arg)
601 {
602 	nce_t	*nce = (nce_t *)nce_arg;
603 
604 	ASSERT(ire->ire_type == IRE_CACHE);
605 
606 	if (ire->ire_nce == nce) {
607 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
608 		ire_delete(ire);
609 	}
610 }
611 
612 /*
613  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
614  */
615 boolean_t
616 ndp_restart_dad(nce_t *nce)
617 {
618 	boolean_t started;
619 	boolean_t dropped;
620 
621 	if (nce == NULL)
622 		return (B_FALSE);
623 	mutex_enter(&nce->nce_lock);
624 	if (nce->nce_state == ND_PROBE) {
625 		mutex_exit(&nce->nce_lock);
626 		started = B_TRUE;
627 	} else if (nce->nce_state == ND_REACHABLE) {
628 		nce->nce_state = ND_PROBE;
629 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
630 		mutex_exit(&nce->nce_lock);
631 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
632 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
633 		if (dropped) {
634 			mutex_enter(&nce->nce_lock);
635 			nce->nce_pcnt++;
636 			mutex_exit(&nce->nce_lock);
637 		}
638 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
639 		started = B_TRUE;
640 	} else {
641 		mutex_exit(&nce->nce_lock);
642 		started = B_FALSE;
643 	}
644 	return (started);
645 }
646 
647 /*
648  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
649  * If one is found, the refcnt on the nce will be incremented.
650  */
651 nce_t *
652 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
653 {
654 	nce_t	*nce;
655 	ip_stack_t	*ipst;
656 
657 	ASSERT(ill != NULL);
658 	ipst = ill->ill_ipst;
659 
660 	ASSERT(ill != NULL && ill->ill_isv6);
661 	if (!caller_holds_lock) {
662 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
663 	}
664 
665 	/* Get head of v6 hash table */
666 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
667 	nce = nce_lookup_addr(ill, addr, nce);
668 	if (nce == NULL)
669 		nce = nce_lookup_mapping(ill, addr);
670 	if (!caller_holds_lock)
671 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
672 	return (nce);
673 }
674 /*
675  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
676  * If one is found, the refcnt on the nce will be incremented.
677  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
678  * so we skip the nce_lookup_mapping call.
679  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
680  */
681 nce_t *
682 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
683 {
684 	nce_t	*nce;
685 	in6_addr_t addr6;
686 	ip_stack_t *ipst = ill->ill_ipst;
687 
688 	if (!caller_holds_lock) {
689 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
690 	}
691 
692 	/* Get head of v4 hash table */
693 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
694 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
695 	nce = nce_lookup_addr(ill, &addr6, nce);
696 	if (!caller_holds_lock)
697 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
698 	return (nce);
699 }
700 
701 /*
702  * Cache entry lookup.  Try to find an nce matching the parameters passed.
703  * Look only for exact entries (no mappings).  If an nce is found, increment
704  * the hold count on that nce. The caller passes in the start of the
705  * appropriate hash table, and must be holding the appropriate global
706  * lock (ndp_g_lock).
707  */
708 static nce_t *
709 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
710 {
711 	ndp_g_t		*ndp;
712 	ip_stack_t	*ipst = ill->ill_ipst;
713 
714 	if (ill->ill_isv6)
715 		ndp = ipst->ips_ndp6;
716 	else
717 		ndp = ipst->ips_ndp4;
718 
719 	ASSERT(ill != NULL);
720 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
721 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
722 		return (NULL);
723 	for (; nce != NULL; nce = nce->nce_next) {
724 		if (nce->nce_ill == ill) {
725 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
726 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
727 			    &ipv6_all_ones)) {
728 				mutex_enter(&nce->nce_lock);
729 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
730 					NCE_REFHOLD_LOCKED(nce);
731 					mutex_exit(&nce->nce_lock);
732 					break;
733 				}
734 				mutex_exit(&nce->nce_lock);
735 			}
736 		}
737 	}
738 	return (nce);
739 }
740 
741 /*
742  * Cache entry lookup.  Try to find an nce matching the parameters passed.
743  * Look only for mappings.
744  */
745 static nce_t *
746 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
747 {
748 	nce_t	*nce;
749 	ip_stack_t	*ipst = ill->ill_ipst;
750 
751 	ASSERT(ill != NULL && ill->ill_isv6);
752 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
753 	if (!IN6_IS_ADDR_MULTICAST(addr))
754 		return (NULL);
755 	nce = ipst->ips_ndp6->nce_mask_entries;
756 	for (; nce != NULL; nce = nce->nce_next)
757 		if (nce->nce_ill == ill &&
758 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
759 			mutex_enter(&nce->nce_lock);
760 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
761 				NCE_REFHOLD_LOCKED(nce);
762 				mutex_exit(&nce->nce_lock);
763 				break;
764 			}
765 			mutex_exit(&nce->nce_lock);
766 		}
767 	return (nce);
768 }
769 
770 /*
771  * Process passed in parameters either from an incoming packet or via
772  * user ioctl.
773  */
774 void
775 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
776 {
777 	ill_t	*ill = nce->nce_ill;
778 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
779 	mblk_t	*mp;
780 	boolean_t ll_updated = B_FALSE;
781 	boolean_t ll_changed;
782 	ip_stack_t	*ipst = ill->ill_ipst;
783 
784 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
785 	/*
786 	 * No updates of link layer address or the neighbor state is
787 	 * allowed, when the cache is in NONUD state.  This still
788 	 * allows for responding to reachability solicitation.
789 	 */
790 	mutex_enter(&nce->nce_lock);
791 	if (nce->nce_state == ND_INCOMPLETE) {
792 		if (hw_addr == NULL) {
793 			mutex_exit(&nce->nce_lock);
794 			return;
795 		}
796 		nce_set_ll(nce, hw_addr);
797 		/*
798 		 * Update nce state and send the queued packets
799 		 * back to ip this time ire will be added.
800 		 */
801 		if (flag & ND_NA_FLAG_SOLICITED) {
802 			nce_update(nce, ND_REACHABLE, NULL);
803 		} else {
804 			nce_update(nce, ND_STALE, NULL);
805 		}
806 		mutex_exit(&nce->nce_lock);
807 		nce_fastpath(nce);
808 		mutex_enter(&nce->nce_lock);
809 		mp = nce->nce_qd_mp;
810 		nce->nce_qd_mp = NULL;
811 		mutex_exit(&nce->nce_lock);
812 		while (mp != NULL) {
813 			mblk_t *nxt_mp, *data_mp;
814 
815 			nxt_mp = mp->b_next;
816 			mp->b_next = NULL;
817 
818 			if (mp->b_datap->db_type == M_CTL)
819 				data_mp = mp->b_cont;
820 			else
821 				data_mp = mp;
822 			if (data_mp->b_prev != NULL) {
823 				ill_t   *inbound_ill;
824 				queue_t *fwdq = NULL;
825 				uint_t ifindex;
826 
827 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
828 				inbound_ill = ill_lookup_on_ifindex(ifindex,
829 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
830 				if (inbound_ill == NULL) {
831 					data_mp->b_prev = NULL;
832 					freemsg(mp);
833 					return;
834 				} else {
835 					fwdq = inbound_ill->ill_rq;
836 				}
837 				data_mp->b_prev = NULL;
838 				/*
839 				 * Send a forwarded packet back into ip_rput_v6
840 				 * just as in ire_send_v6().
841 				 * Extract the queue from b_prev (set in
842 				 * ip_rput_data_v6).
843 				 */
844 				if (fwdq != NULL) {
845 					/*
846 					 * Forwarded packets hop count will
847 					 * get decremented in ip_rput_data_v6
848 					 */
849 					if (data_mp != mp)
850 						freeb(mp);
851 					put(fwdq, data_mp);
852 				} else {
853 					/*
854 					 * Send locally originated packets back
855 					 * into * ip_wput_v6.
856 					 */
857 					put(ill->ill_wq, mp);
858 				}
859 				ill_refrele(inbound_ill);
860 			} else {
861 				put(ill->ill_wq, mp);
862 			}
863 			mp = nxt_mp;
864 		}
865 		return;
866 	}
867 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
868 	if (!is_adv) {
869 		/* If this is a SOLICITATION request only */
870 		if (ll_changed)
871 			nce_update(nce, ND_STALE, hw_addr);
872 		mutex_exit(&nce->nce_lock);
873 		return;
874 	}
875 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
876 		/* If in any other state than REACHABLE, ignore */
877 		if (nce->nce_state == ND_REACHABLE) {
878 			nce_update(nce, ND_STALE, NULL);
879 		}
880 		mutex_exit(&nce->nce_lock);
881 		return;
882 	} else {
883 		if (ll_changed) {
884 			nce_update(nce, ND_UNCHANGED, hw_addr);
885 			ll_updated = B_TRUE;
886 		}
887 		if (flag & ND_NA_FLAG_SOLICITED) {
888 			nce_update(nce, ND_REACHABLE, NULL);
889 		} else {
890 			if (ll_updated) {
891 				nce_update(nce, ND_STALE, NULL);
892 			}
893 		}
894 		mutex_exit(&nce->nce_lock);
895 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
896 		    NCE_F_ISROUTER)) {
897 			ire_t *ire;
898 
899 			/*
900 			 * Router turned to host.  We need to remove the
901 			 * entry as well as any default route that may be
902 			 * using this as a next hop.  This is required by
903 			 * section 7.2.5 of RFC 2461.
904 			 */
905 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
906 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
907 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
908 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
909 			    MATCH_IRE_DEFAULT, ipst);
910 			if (ire != NULL) {
911 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
912 				ire_delete(ire);
913 				ire_refrele(ire);
914 			}
915 			ndp_delete(nce);
916 		}
917 	}
918 }
919 
920 /*
921  * Pass arg1 to the pfi supplied, along with each nce in existence.
922  * ndp_walk() places a REFHOLD on the nce and drops the lock when
923  * walking the hash list.
924  */
925 void
926 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
927     boolean_t trace)
928 {
929 
930 	nce_t	*nce;
931 	nce_t	*nce1;
932 	nce_t	**ncep;
933 	nce_t	*free_nce_list = NULL;
934 
935 	mutex_enter(&ndp->ndp_g_lock);
936 	/* Prevent ndp_delete from unlink and free of NCE */
937 	ndp->ndp_g_walker++;
938 	mutex_exit(&ndp->ndp_g_lock);
939 	for (ncep = ndp->nce_hash_tbl;
940 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
941 		for (nce = *ncep; nce != NULL; nce = nce1) {
942 			nce1 = nce->nce_next;
943 			if (ill == NULL || nce->nce_ill == ill) {
944 				if (trace) {
945 					NCE_REFHOLD(nce);
946 					(*pfi)(nce, arg1);
947 					NCE_REFRELE(nce);
948 				} else {
949 					NCE_REFHOLD_NOTR(nce);
950 					(*pfi)(nce, arg1);
951 					NCE_REFRELE_NOTR(nce);
952 				}
953 			}
954 		}
955 	}
956 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
957 		nce1 = nce->nce_next;
958 		if (ill == NULL || nce->nce_ill == ill) {
959 			if (trace) {
960 				NCE_REFHOLD(nce);
961 				(*pfi)(nce, arg1);
962 				NCE_REFRELE(nce);
963 			} else {
964 				NCE_REFHOLD_NOTR(nce);
965 				(*pfi)(nce, arg1);
966 				NCE_REFRELE_NOTR(nce);
967 			}
968 		}
969 	}
970 	mutex_enter(&ndp->ndp_g_lock);
971 	ndp->ndp_g_walker--;
972 	/*
973 	 * While NCE's are removed from global list they are placed
974 	 * in a private list, to be passed to nce_ire_delete_list().
975 	 * The reason is, there may be ires pointing to this nce
976 	 * which needs to cleaned up.
977 	 */
978 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
979 		/* Time to delete condemned entries */
980 		for (ncep = ndp->nce_hash_tbl;
981 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
982 			nce = *ncep;
983 			if (nce != NULL) {
984 				nce_remove(ndp, nce, &free_nce_list);
985 			}
986 		}
987 		nce = ndp->nce_mask_entries;
988 		if (nce != NULL) {
989 			nce_remove(ndp, nce, &free_nce_list);
990 		}
991 		ndp->ndp_g_walker_cleanup = B_FALSE;
992 	}
993 
994 	mutex_exit(&ndp->ndp_g_lock);
995 
996 	if (free_nce_list != NULL) {
997 		nce_ire_delete_list(free_nce_list);
998 	}
999 }
1000 
1001 /*
1002  * Walk everything.
1003  * Note that ill can be NULL hence can't derive the ipst from it.
1004  */
1005 void
1006 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1007 {
1008 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1009 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1010 }
1011 
/*
 * Process resolve requests.  Handles both mapped entries
 * as well as cases that need to be sent out on the wire.
 * Lookup a NCE for a given IRE.  Regardless of whether one exists
 * or one is created, we defer making ire point to nce until the
 * ire is actually added at which point the nce_refcnt on the nce is
 * incremented.  This is done primarily to have symmetry between ire_add()
 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
 *
 * Multicast destinations are handed to nce_set_multicast() and its result
 * is returned as is.  For any other destination the return value is:
 *
 *	0		the NCE is already in a state other than
 *			ND_INCOMPLETE, or the ill is ILLF_XRESOLV and a new
 *			entry was created (no solicitation is sent here).
 *	EINPROGRESS	a solicitation was started for a new entry, or mp was
 *			queued on an existing ND_INCOMPLETE entry.
 *	ENOMEM		ip_prepend_zoneid() failed.
 *	EBUSY		nce_solicit() returned 0 for a newly created entry.
 *	other		error returned by ndp_lookup_then_add_v6().
 *
 * On ENOMEM and EBUSY for a new entry, the new NCE is deleted again and the
 * caller remains responsible for freeing mp.
 */
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
	nce_t		*nce;
	int		err = 0;
	uint32_t	ms;
	mblk_t		*mp_nce = NULL;
	ip_stack_t	*ipst = ill->ill_ipst;

	ASSERT(ill->ill_isv6);
	if (IN6_IS_ADDR_MULTICAST(dst)) {
		err = nce_set_multicast(ill, dst);
		return (err);
	}
	err = ndp_lookup_then_add_v6(ill,
	    NULL,	/* No hardware address */
	    dst,
	    &ipv6_all_ones,
	    &ipv6_all_zeros,
	    0,
	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
	    ND_INCOMPLETE,
	    &nce);

	switch (err) {
	case 0:
		/*
		 * New cache entry was created. Make sure that the state
		 * is not ND_INCOMPLETE. It can be in some other state
		 * even before we send out the solicitation as we could
		 * get un-solicited advertisements.
		 *
		 * If this is an XRESOLV interface, simply return 0,
		 * since we don't want to solicit just yet.
		 */
		if (ill->ill_flags & ILLF_XRESOLV) {
			NCE_REFRELE(nce);
			return (0);
		}
		/*
		 * nce_solicit() asserts that both ips_ill_g_lock (as reader)
		 * and nce_lock are held on entry, so take them in that order.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state != ND_INCOMPLETE) {
			/* Already past INCOMPLETE; nothing to solicit. */
			mutex_exit(&nce->nce_lock);
			rw_exit(&ipst->ips_ill_g_lock);
			NCE_REFRELE(nce);
			return (0);
		}
		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
		if (mp_nce == NULL) {
			/* The caller will free mp */
			mutex_exit(&nce->nce_lock);
			rw_exit(&ipst->ips_ill_g_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (ENOMEM);
		}
		ms = nce_solicit(nce, mp_nce);
		rw_exit(&ipst->ips_ill_g_lock);
		if (ms == 0) {
			/*
			 * The caller will free mp.  Only the extra block is
			 * released here, and only when ip_prepend_zoneid()
			 * returned something other than mp itself
			 * (presumably a prepended header block; verify
			 * against ip_prepend_zoneid()).
			 */
			if (mp_nce != mp)
				freeb(mp_nce);
			mutex_exit(&nce->nce_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (EBUSY);
		}
		mutex_exit(&nce->nce_lock);
		/* ms is the retransmit interval returned by nce_solicit(). */
		NDP_RESTART_TIMER(nce, (clock_t)ms);
		NCE_REFRELE(nce);
		return (EINPROGRESS);
	case EEXIST:
		/* Resolution in progress just queue the packet */
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state == ND_INCOMPLETE) {
			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
			if (mp_nce == NULL) {
				err = ENOMEM;
			} else {
				nce_queue_mp(nce, mp_nce);
				err = EINPROGRESS;
			}
		} else {
			/*
			 * Any other state implies we have
			 * a nce but IRE needs to be added ...
			 * ire_add_v6() will take care of
			 * the case when the nce becomes CONDEMNED
			 * before the ire is added to the table.
			 */
			err = 0;
		}
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	default:
		/* No NCE reference was obtained; just report the error. */
		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
		break;
	}
	return (err);
}
1122 
1123 /*
1124  * When there is no resolver, the link layer template is passed in
1125  * the IRE.
1126  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1127  * or one is created, we defer making ire point to nce until the
1128  * ire is actually added at which point the nce_refcnt on the nce is
1129  * incremented.  This is done primarily to have symmetry between ire_add()
1130  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1131  */
1132 int
1133 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1134 {
1135 	nce_t		*nce;
1136 	int		err = 0;
1137 
1138 	ASSERT(ill != NULL);
1139 	ASSERT(ill->ill_isv6);
1140 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1141 		err = nce_set_multicast(ill, dst);
1142 		return (err);
1143 	}
1144 
1145 	err = ndp_lookup_then_add_v6(ill,
1146 	    NULL,	/* hardware address */
1147 	    dst,
1148 	    &ipv6_all_ones,
1149 	    &ipv6_all_zeros,
1150 	    0,
1151 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1152 	    ND_REACHABLE,
1153 	    &nce);
1154 
1155 	switch (err) {
1156 	case 0:
1157 		/*
1158 		 * Cache entry with a proper resolver cookie was
1159 		 * created.
1160 		 */
1161 		NCE_REFRELE(nce);
1162 		break;
1163 	case EEXIST:
1164 		err = 0;
1165 		NCE_REFRELE(nce);
1166 		break;
1167 	default:
1168 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1169 		break;
1170 	}
1171 	return (err);
1172 }
1173 
1174 /*
1175  * For each interface an entry is added for the unspecified multicast group.
1176  * Here that mapping is used to form the multicast cache entry for a particular
1177  * multicast destination.
1178  */
1179 static int
1180 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1181 {
1182 	nce_t		*mnce;	/* Multicast mapping entry */
1183 	nce_t		*nce;
1184 	uchar_t		*hw_addr = NULL;
1185 	int		err = 0;
1186 	ip_stack_t	*ipst = ill->ill_ipst;
1187 
1188 	ASSERT(ill != NULL);
1189 	ASSERT(ill->ill_isv6);
1190 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1191 
1192 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1193 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1194 	nce = nce_lookup_addr(ill, dst, nce);
1195 	if (nce != NULL) {
1196 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1197 		NCE_REFRELE(nce);
1198 		return (0);
1199 	}
1200 	/* No entry, now lookup for a mapping this should never fail */
1201 	mnce = nce_lookup_mapping(ill, dst);
1202 	if (mnce == NULL) {
1203 		/* Something broken for the interface. */
1204 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1205 		return (ESRCH);
1206 	}
1207 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1208 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1209 		/*
1210 		 * For IRE_IF_RESOLVER a hardware mapping can be
1211 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1212 		 * in the ill is copied in ndp_add_v6().
1213 		 */
1214 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1215 		if (hw_addr == NULL) {
1216 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1217 			NCE_REFRELE(mnce);
1218 			return (ENOMEM);
1219 		}
1220 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1221 	}
1222 	NCE_REFRELE(mnce);
1223 	/*
1224 	 * IRE_IF_NORESOLVER type simply copies the resolution
1225 	 * cookie passed in.  So no hw_addr is needed.
1226 	 */
1227 	err = ndp_add_v6(ill,
1228 	    hw_addr,
1229 	    dst,
1230 	    &ipv6_all_ones,
1231 	    &ipv6_all_zeros,
1232 	    0,
1233 	    NCE_F_NONUD,
1234 	    ND_REACHABLE,
1235 	    &nce);
1236 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1237 	if (hw_addr != NULL)
1238 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1239 	if (err != 0) {
1240 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1241 		return (err);
1242 	}
1243 	NCE_REFRELE(nce);
1244 	return (0);
1245 }
1246 
1247 /*
1248  * Return the link layer address, and any flags of a nce.
1249  */
1250 int
1251 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1252 {
1253 	nce_t		*nce;
1254 	in6_addr_t	*addr;
1255 	sin6_t		*sin6;
1256 	dl_unitdata_req_t	*dl;
1257 
1258 	ASSERT(ill != NULL && ill->ill_isv6);
1259 	sin6 = (sin6_t *)&lnr->lnr_addr;
1260 	addr =  &sin6->sin6_addr;
1261 
1262 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1263 	if (nce == NULL)
1264 		return (ESRCH);
1265 	/* If in INCOMPLETE state, no link layer address is available yet */
1266 	if (nce->nce_state == ND_INCOMPLETE)
1267 		goto done;
1268 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1269 	if (ill->ill_flags & ILLF_XRESOLV)
1270 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1271 	else
1272 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1273 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1274 	    sizeof (lnr->lnr_hdw_addr));
1275 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1276 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1277 	if (nce->nce_flags & NCE_F_ISROUTER)
1278 		lnr->lnr_flags = NDF_ISROUTER_ON;
1279 	if (nce->nce_flags & NCE_F_ANYCAST)
1280 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1281 done:
1282 	NCE_REFRELE(nce);
1283 	return (0);
1284 }
1285 
1286 /*
1287  * Send Enable/Disable multicast reqs to driver.
1288  */
1289 int
1290 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1291     uint32_t hw_addr_offset, mblk_t *mp)
1292 {
1293 	nce_t		*nce;
1294 	uchar_t		*hw_addr;
1295 	ip_stack_t	*ipst = ill->ill_ipst;
1296 
1297 	ASSERT(ill != NULL && ill->ill_isv6);
1298 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1299 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1300 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1301 		freemsg(mp);
1302 		return (EINVAL);
1303 	}
1304 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1305 	nce = nce_lookup_mapping(ill, addr);
1306 	if (nce == NULL) {
1307 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1308 		freemsg(mp);
1309 		return (ESRCH);
1310 	}
1311 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1312 	/*
1313 	 * Update dl_addr_length and dl_addr_offset for primitives that
1314 	 * have physical addresses as opposed to full saps
1315 	 */
1316 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1317 	case DL_ENABMULTI_REQ:
1318 		/* Track the state if this is the first enabmulti */
1319 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1320 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1321 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1322 		break;
1323 	case DL_DISABMULTI_REQ:
1324 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1325 		break;
1326 	default:
1327 		NCE_REFRELE(nce);
1328 		ip1dbg(("ndp_mcastreq: default\n"));
1329 		return (EINVAL);
1330 	}
1331 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1332 	NCE_REFRELE(nce);
1333 	ill_dlpi_send(ill, mp);
1334 	return (0);
1335 }
1336 
/*
 * Send a neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt, and also
 * holds ips_ill_g_lock as reader (both are asserted below).
 *
 * If mp is NULL the packet already queued on the nce (nce_qd_mp) is used to
 * pick the source address; otherwise mp is passed to nce_queue_mp() first.
 *
 * NOTE: This routine drops nce_lock and ips_ill_g_lock (and later reacquires
 * them) when sending the packet.  Neither lock is dropped on the early
 * return paths.
 * NOTE: This routine does not consume mp.
 */
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
	ill_t		*ill;
	ill_t		*src_ill;
	ip6_t		*ip6h;
	in6_addr_t	src;
	in6_addr_t	dst;
	ipif_t		*ipif;
	ip6i_t		*ip6i;
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;

	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ill = nce->nce_ill;
	ASSERT(ill != NULL);

	/* No retransmissions left: tell the caller to abort. */
	if (nce->nce_rcnt == 0) {
		return (0);
	}

	if (mp == NULL) {
		ASSERT(nce->nce_qd_mp != NULL);
		mp = nce->nce_qd_mp;
	} else {
		nce_queue_mp(nce, mp);
	}

	/* Handle ip_newroute_v6 giving us IPSEC packets */
	if (mp->b_datap->db_type == M_CTL)
		mp = mp->b_cont;

	ip6h = (ip6_t *)mp->b_rptr;
	if (ip6h->ip6_nxt == IPPROTO_RAW) {
		/*
		 * This message should have been pulled up already in
		 * ip_wput_v6. We can't do pullups here because the message
		 * could be from the nce_qd_mp which could have b_next/b_prev
		 * non-NULL.
		 */
		ip6i = (ip6i_t *)ip6h;
		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
		    sizeof (ip6i_t) + IPV6_HDR_LEN);
		/* The real IPv6 header follows the ip6i_t. */
		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	}
	src = ip6h->ip6_src;
	/*
	 * If the src of outgoing packet is one of the assigned interface
	 * addresses use it, otherwise we will pick the source address below.
	 */
	src_ill = ill;
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* Search every ill of the group, if the ill is in one. */
		if (ill->ill_group != NULL)
			src_ill = ill->ill_group->illgrp_ill;
		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
			for (ipif = src_ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				if (IN6_ARE_ADDR_EQUAL(&src,
				    &ipif->ipif_v6lcl_addr)) {
					break;
				}
			}
			/* A match leaves both src_ill and ipif non-NULL. */
			if (ipif != NULL)
				break;
		}
		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and let nce_xmit choose one.  If an
		 * ipif can be found, but it's not yet done with DAD
		 * verification, then just postpone this transmission until
		 * later.
		 */
		if (src_ill == NULL)
			src = ipv6_all_zeros;
		else if (!ipif->ipif_addr_ready)
			return (ill->ill_reachable_retrans_time);
	}
	dst = nce->nce_addr;
	/*
	 * If source address is unspecified, nce_xmit will choose
	 * one for us and initialize the hardware address also
	 * appropriately.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&src))
		src_ill = NULL;
	/* Charge this attempt before the locks are dropped. */
	nce->nce_rcnt--;
	mutex_exit(&nce->nce_lock);
	rw_exit(&ipst->ips_ill_g_lock);
	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
	    &dst, 0);
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	/* A solicitation that was not sent does not count as an attempt. */
	if (dropped)
		nce->nce_rcnt++;
	return (ill->ill_reachable_retrans_time);
}
1443 
1444 /*
1445  * Attempt to recover an address on an interface that's been marked as a
1446  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1447  * no easy way to just probe the address and have the right thing happen if
1448  * it's no longer in use.  Instead, we just bring it up normally and allow the
1449  * regular interface start-up logic to probe for a remaining duplicate and take
1450  * us back down if necessary.
1451  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1452  * ip_ndp_excl.
1453  */
1454 /* ARGSUSED */
1455 static void
1456 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1457 {
1458 	ill_t	*ill = rq->q_ptr;
1459 	ipif_t	*ipif;
1460 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1461 
1462 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1463 		/*
1464 		 * We do not support recovery of proxy ARP'd interfaces,
1465 		 * because the system lacks a complete proxy ARP mechanism.
1466 		 */
1467 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1468 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1469 			continue;
1470 		}
1471 
1472 		/*
1473 		 * If we have already recovered or if the interface is going
1474 		 * away, then ignore.
1475 		 */
1476 		mutex_enter(&ill->ill_lock);
1477 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1478 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1479 			mutex_exit(&ill->ill_lock);
1480 			continue;
1481 		}
1482 
1483 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1484 		ill->ill_ipif_dup_count--;
1485 		mutex_exit(&ill->ill_lock);
1486 		ipif->ipif_was_dup = B_TRUE;
1487 
1488 		if (ipif_ndp_up(ipif) != EINPROGRESS)
1489 			(void) ipif_up_done_v6(ipif);
1490 	}
1491 	freeb(mp);
1492 }
1493 
1494 /*
1495  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1496  * As long as someone else holds the address, the interface will stay down.
1497  * When that conflict goes away, the interface is brought back up.  This is
1498  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1499  * server will recover from a failure.
1500  *
1501  * For DHCP and temporary addresses, recovery is not done in the kernel.
1502  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1503  *
1504  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1505  */
1506 static void
1507 ipif6_dup_recovery(void *arg)
1508 {
1509 	ipif_t *ipif = arg;
1510 
1511 	ipif->ipif_recovery_id = 0;
1512 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1513 		return;
1514 
1515 	/*
1516 	 * No lock, because this is just an optimization.
1517 	 */
1518 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1519 		return;
1520 
1521 	/* If the link is down, we'll retry this later */
1522 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1523 		return;
1524 
1525 	ndp_do_recovery(ipif);
1526 }
1527 
1528 /*
1529  * Perform interface recovery by forcing the duplicate interfaces up and
1530  * allowing the system to determine which ones should stay up.
1531  *
1532  * Called both by recovery timer expiry and link-up notification.
1533  */
1534 void
1535 ndp_do_recovery(ipif_t *ipif)
1536 {
1537 	ill_t *ill = ipif->ipif_ill;
1538 	mblk_t *mp;
1539 	ip_stack_t *ipst = ill->ill_ipst;
1540 
1541 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1542 	if (mp == NULL) {
1543 		mutex_enter(&ill->ill_lock);
1544 		if (ipif->ipif_recovery_id == 0 &&
1545 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1546 		    IPIF_CONDEMNED))) {
1547 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1548 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1549 		}
1550 		mutex_exit(&ill->ill_lock);
1551 	} else {
1552 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1553 		    sizeof (ipif->ipif_v6lcl_addr));
1554 		ill_refhold(ill);
1555 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1556 		    B_FALSE);
1557 	}
1558 }
1559 
1560 /*
1561  * Find the solicitation in the given message, and extract printable details
1562  * (MAC and IP addresses) from it.
1563  */
1564 static nd_neighbor_solicit_t *
1565 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1566     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1567 {
1568 	nd_neighbor_solicit_t *ns;
1569 	ip6_t *ip6h;
1570 	uchar_t *addr;
1571 	int alen;
1572 
1573 	alen = 0;
1574 	ip6h = (ip6_t *)mp->b_rptr;
1575 	if (dl_mp == NULL) {
1576 		nd_opt_hdr_t *opt;
1577 		int nslen;
1578 
1579 		/*
1580 		 * If it's from the fast-path, then it can't be a probe
1581 		 * message, and thus must include the source linkaddr option.
1582 		 * Extract that here.
1583 		 */
1584 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1585 		nslen = mp->b_wptr - (uchar_t *)ns;
1586 		if ((nslen -= sizeof (*ns)) > 0) {
1587 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1588 			    ND_OPT_SOURCE_LINKADDR);
1589 			if (opt != NULL &&
1590 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1591 			    ill->ill_nd_lla_len) {
1592 				addr = (uchar_t *)(opt + 1);
1593 				alen = ill->ill_nd_lla_len;
1594 			}
1595 		}
1596 		/*
1597 		 * We cheat a bit here for the sake of printing usable log
1598 		 * messages in the rare case where the reply we got was unicast
1599 		 * without a source linkaddr option, and the interface is in
1600 		 * fastpath mode.  (Sigh.)
1601 		 */
1602 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1603 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1604 			struct ether_header *pether;
1605 
1606 			pether = (struct ether_header *)((char *)ip6h -
1607 			    sizeof (*pether));
1608 			addr = pether->ether_shost.ether_addr_octet;
1609 			alen = ETHERADDRL;
1610 		}
1611 	} else {
1612 		dl_unitdata_ind_t *dlu;
1613 
1614 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1615 		alen = dlu->dl_src_addr_length;
1616 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1617 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1618 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1619 			if (ill->ill_sap_length < 0) {
1620 				alen += ill->ill_sap_length;
1621 			} else {
1622 				addr += ill->ill_sap_length;
1623 				alen -= ill->ill_sap_length;
1624 			}
1625 		}
1626 	}
1627 	if (alen > 0) {
1628 		*haddr = addr;
1629 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1630 	} else {
1631 		*haddr = NULL;
1632 		(void) strcpy(hbuf, "?");
1633 	}
1634 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1635 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1636 	return (ns);
1637 }
1638 
/*
 * This is for exclusive changes due to NDP duplicate address detection
 * failure.
 *
 * It is the qwriter_ip() callback queued by ip_ndp_failure().  mp is either
 * the copied packet itself (M_DATA) or a copied DLPI block with the packet
 * in b_cont.  Every ipif on the ill that owns the solicitation's target
 * address is marked IPIF_DUPLICATE and brought down, and a recovery timer is
 * started where recovery applies.  The message is freed before returning.
 */
/* ARGSUSED */
static void
ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	ipif_t	*ipif;
	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
	char hbuf[MAC_STR_LEN];
	char sbuf[INET6_ADDRSTRLEN];
	nd_neighbor_solicit_t *ns;
	mblk_t *dl_mp = NULL;
	uchar_t *haddr;
	ip_stack_t *ipst = ill->ill_ipst;

	/* Split off the leading non-data (DLPI) block, if there is one. */
	if (DB_TYPE(mp) != M_DATA) {
		dl_mp = mp;
		mp = mp->b_cont;
	}
	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
	    sizeof (sbuf), &haddr);
	if (haddr != NULL &&
	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
		/*
		 * Ignore conflicts generated by misbehaving switches that just
		 * reflect our own messages back to us.
		 */
		goto ignore_conflict;
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {

		/* Skip point-to-point ipifs and those with another address. */
		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
		    &ns->nd_ns_target)) {
			continue;
		}

		/* If it's already marked, then don't do anything. */
		if (ipif->ipif_flags & IPIF_DUPLICATE)
			continue;

		/*
		 * If this is a failure during duplicate recovery, then don't
		 * complain.  It may take a long time to recover.
		 */
		if (!ipif->ipif_was_dup) {
			ipif_get_name(ipif, ibuf, sizeof (ibuf));
			cmn_err(CE_WARN, "%s has duplicate address %s (in "
			    "use by %s); disabled", ibuf, sbuf, hbuf);
		}
		mutex_enter(&ill->ill_lock);
		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
		ipif->ipif_flags |= IPIF_DUPLICATE;
		ill->ill_ipif_dup_count++;
		mutex_exit(&ill->ill_lock);
		/* ill_lock is released before the ipif is taken down. */
		(void) ipif_down(ipif, NULL, NULL);
		ipif_down_tail(ipif);
		/*
		 * Arm the kernel recovery timer only for addresses that are
		 * neither DHCP-managed nor temporary, on resolver-type
		 * interfaces, when the ipif is not moving or condemned, and
		 * when recovery is enabled (ips_ip_dup_recovery > 0).
		 */
		mutex_enter(&ill->ill_lock);
		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
		    ill->ill_net_type == IRE_IF_RESOLVER &&
		    !(ipif->ipif_state_flags & (IPIF_MOVING |
		    IPIF_CONDEMNED)) &&
		    ipst->ips_ip_dup_recovery > 0) {
			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
		}
		mutex_exit(&ill->ill_lock);
	}
ignore_conflict:
	/* Free the DLPI block alone first, then the data message. */
	if (dl_mp != NULL)
		freeb(dl_mp);
	freemsg(mp);
}
1716 
1717 /*
1718  * Handle failure by tearing down the ipifs with the specified address.  Note
1719  * that tearing down the ipif also means deleting the nce through ipif_down, so
1720  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1721  * we start a timer on the ipif.
1722  */
1723 static void
1724 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1725 {
1726 	if ((mp = copymsg(mp)) != NULL) {
1727 		if (dl_mp == NULL)
1728 			dl_mp = mp;
1729 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1730 			dl_mp->b_cont = mp;
1731 		if (dl_mp == NULL) {
1732 			freemsg(mp);
1733 		} else {
1734 			ill_refhold(ill);
1735 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1736 			    B_FALSE);
1737 		}
1738 	}
1739 	ndp_delete(nce);
1740 }
1741 
1742 /*
1743  * Handle a discovered conflict: some other system is advertising that it owns
1744  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1745  * interface.
1746  */
1747 static void
1748 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1749 {
1750 	ipif_t *ipif;
1751 	uint32_t now;
1752 	uint_t maxdefense;
1753 	uint_t defs;
1754 	ip_stack_t *ipst = ill->ill_ipst;
1755 
1756 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1757 	    NULL, NULL, ipst);
1758 	if (ipif == NULL)
1759 		return;
1760 	/*
1761 	 * First, figure out if this address is disposable.
1762 	 */
1763 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1764 		maxdefense = ipst->ips_ip_max_temp_defend;
1765 	else
1766 		maxdefense = ipst->ips_ip_max_defend;
1767 
1768 	/*
1769 	 * Now figure out how many times we've defended ourselves.  Ignore
1770 	 * defenses that happened long in the past.
1771 	 */
1772 	now = gethrestime_sec();
1773 	mutex_enter(&nce->nce_lock);
1774 	if ((defs = nce->nce_defense_count) > 0 &&
1775 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1776 		nce->nce_defense_count = defs = 0;
1777 	}
1778 	nce->nce_defense_count++;
1779 	nce->nce_defense_time = now;
1780 	mutex_exit(&nce->nce_lock);
1781 	ipif_refrele(ipif);
1782 
1783 	/*
1784 	 * If we've defended ourselves too many times already, then give up and
1785 	 * tear down the interface(s) using this address.  Otherwise, defend by
1786 	 * sending out an unsolicited Neighbor Advertisement.
1787 	 */
1788 	if (defs >= maxdefense) {
1789 		ip_ndp_failure(ill, mp, dl_mp, nce);
1790 	} else {
1791 		char hbuf[MAC_STR_LEN];
1792 		char sbuf[INET6_ADDRSTRLEN];
1793 		uchar_t *haddr;
1794 
1795 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1796 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1797 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1798 		    hbuf, sbuf, ill->ill_name);
1799 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1800 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1801 		    nce_advert_flags(nce));
1802 	}
1803 }
1804 
1805 static void
1806 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1807 {
1808 	nd_neighbor_solicit_t *ns;
1809 	uint32_t	hlen = ill->ill_nd_lla_len;
1810 	uchar_t		*haddr = NULL;
1811 	icmp6_t		*icmp_nd;
1812 	ip6_t		*ip6h;
1813 	nce_t		*our_nce = NULL;
1814 	in6_addr_t	target;
1815 	in6_addr_t	src;
1816 	int		len;
1817 	int		flag = 0;
1818 	nd_opt_hdr_t	*opt = NULL;
1819 	boolean_t	bad_solicit = B_FALSE;
1820 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1821 
1822 	ip6h = (ip6_t *)mp->b_rptr;
1823 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1824 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1825 	src = ip6h->ip6_src;
1826 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1827 	target = ns->nd_ns_target;
1828 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1829 		if (ip_debug > 2) {
1830 			/* ip1dbg */
1831 			pr_addr_dbg("ndp_input_solicit: Target is"
1832 			    " multicast! %s\n", AF_INET6, &target);
1833 		}
1834 		bad_solicit = B_TRUE;
1835 		goto done;
1836 	}
1837 	if (len > sizeof (nd_neighbor_solicit_t)) {
1838 		/* Options present */
1839 		opt = (nd_opt_hdr_t *)&ns[1];
1840 		len -= sizeof (nd_neighbor_solicit_t);
1841 		if (!ndp_verify_optlen(opt, len)) {
1842 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1843 			bad_solicit = B_TRUE;
1844 			goto done;
1845 		}
1846 	}
1847 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1848 		/* Check to see if this is a valid DAD solicitation */
1849 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1850 			if (ip_debug > 2) {
1851 				/* ip1dbg */
1852 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1853 				    "Destination is not solicited node "
1854 				    "multicast %s\n", AF_INET6,
1855 				    &ip6h->ip6_dst);
1856 			}
1857 			bad_solicit = B_TRUE;
1858 			goto done;
1859 		}
1860 	}
1861 
1862 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1863 	/*
1864 	 * If this is a valid Solicitation, a permanent
1865 	 * entry should exist in the cache
1866 	 */
1867 	if (our_nce == NULL ||
1868 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1869 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1870 		    "ifname=%s ", ill->ill_name));
1871 		if (ip_debug > 2) {
1872 			/* ip1dbg */
1873 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1874 		}
1875 		bad_solicit = B_TRUE;
1876 		goto done;
1877 	}
1878 
1879 	/* At this point we should have a verified NS per spec */
1880 	if (opt != NULL) {
1881 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1882 		if (opt != NULL) {
1883 			haddr = (uchar_t *)&opt[1];
1884 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1885 			    hlen == 0) {
1886 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1887 				bad_solicit = B_TRUE;
1888 				goto done;
1889 			}
1890 		}
1891 	}
1892 
1893 	/* If sending directly to peer, set the unicast flag */
1894 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1895 		flag |= NDP_UNICAST;
1896 
1897 	/*
1898 	 * Create/update the entry for the soliciting node.
1899 	 * or respond to outstanding queries, don't if
1900 	 * the source is unspecified address.
1901 	 */
1902 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1903 		int	err;
1904 		nce_t	*nnce;
1905 
1906 		ASSERT(ill->ill_isv6);
1907 		/*
1908 		 * Regular solicitations *must* include the Source Link-Layer
1909 		 * Address option.  Ignore messages that do not.
1910 		 */
1911 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1912 			ip1dbg(("ndp_input_solicit: source link-layer address "
1913 			    "option missing with a specified source.\n"));
1914 			bad_solicit = B_TRUE;
1915 			goto done;
1916 		}
1917 
1918 		/*
1919 		 * This is a regular solicitation.  If we're still in the
1920 		 * process of verifying the address, then don't respond at all
1921 		 * and don't keep track of the sender.
1922 		 */
1923 		if (our_nce->nce_state == ND_PROBE)
1924 			goto done;
1925 
1926 		/*
1927 		 * If the solicitation doesn't have sender hardware address
1928 		 * (legal for unicast solicitation), then process without
1929 		 * installing the return NCE.  Either we already know it, or
1930 		 * we'll be forced to look it up when (and if) we reply to the
1931 		 * packet.
1932 		 */
1933 		if (haddr == NULL)
1934 			goto no_source;
1935 
1936 		err = ndp_lookup_then_add_v6(ill,
1937 		    haddr,
1938 		    &src,	/* Soliciting nodes address */
1939 		    &ipv6_all_ones,
1940 		    &ipv6_all_zeros,
1941 		    0,
1942 		    0,
1943 		    ND_STALE,
1944 		    &nnce);
1945 		switch (err) {
1946 		case 0:
1947 			/* done with this entry */
1948 			NCE_REFRELE(nnce);
1949 			break;
1950 		case EEXIST:
1951 			/*
1952 			 * B_FALSE indicates this is not an
1953 			 * an advertisement.
1954 			 */
1955 			ndp_process(nnce, haddr, 0, B_FALSE);
1956 			NCE_REFRELE(nnce);
1957 			break;
1958 		default:
1959 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1960 			    err));
1961 			goto done;
1962 		}
1963 no_source:
1964 		flag |= NDP_SOLICITED;
1965 	} else {
1966 		/*
1967 		 * No source link layer address option should be present in a
1968 		 * valid DAD request.
1969 		 */
1970 		if (haddr != NULL) {
1971 			ip1dbg(("ndp_input_solicit: source link-layer address "
1972 			    "option present with an unspecified source.\n"));
1973 			bad_solicit = B_TRUE;
1974 			goto done;
1975 		}
1976 		if (our_nce->nce_state == ND_PROBE) {
1977 			/*
1978 			 * Internally looped-back probes won't have DLPI
1979 			 * attached to them.  External ones (which are sent by
1980 			 * multicast) always will.  Just ignore our own
1981 			 * transmissions.
1982 			 */
1983 			if (dl_mp != NULL) {
1984 				/*
1985 				 * If someone else is probing our address, then
1986 				 * we've crossed wires.  Declare failure.
1987 				 */
1988 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
1989 			}
1990 			goto done;
1991 		}
1992 		/*
1993 		 * This is a DAD probe.  Multicast the advertisement to the
1994 		 * all-nodes address.
1995 		 */
1996 		src = ipv6_all_hosts_mcast;
1997 	}
1998 	flag |= nce_advert_flags(our_nce);
1999 	/* Response to a solicitation */
2000 	(void) nce_xmit(ill,
2001 	    ND_NEIGHBOR_ADVERT,
2002 	    ill,	/* ill to be used for extracting ill_nd_lla */
2003 	    B_TRUE,	/* use ill_nd_lla */
2004 	    &target,	/* Source and target of the advertisement pkt */
2005 	    &src,	/* IP Destination (source of original pkt) */
2006 	    flag);
2007 done:
2008 	if (bad_solicit)
2009 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2010 	if (our_nce != NULL)
2011 		NCE_REFRELE(our_nce);
2012 }
2013 
2014 void
2015 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2016 {
2017 	nd_neighbor_advert_t *na;
2018 	uint32_t	hlen = ill->ill_nd_lla_len;
2019 	uchar_t		*haddr = NULL;
2020 	icmp6_t		*icmp_nd;
2021 	ip6_t		*ip6h;
2022 	nce_t		*dst_nce = NULL;
2023 	in6_addr_t	target;
2024 	nd_opt_hdr_t	*opt = NULL;
2025 	int		len;
2026 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2027 	ip_stack_t	*ipst = ill->ill_ipst;
2028 
2029 	ip6h = (ip6_t *)mp->b_rptr;
2030 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2031 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2032 	na = (nd_neighbor_advert_t *)icmp_nd;
2033 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2034 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2035 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2036 		    "solicited flag is not zero\n"));
2037 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2038 		return;
2039 	}
2040 	target = na->nd_na_target;
2041 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2042 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2043 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2044 		return;
2045 	}
2046 	if (len > sizeof (nd_neighbor_advert_t)) {
2047 		opt = (nd_opt_hdr_t *)&na[1];
2048 		if (!ndp_verify_optlen(opt,
2049 		    len - sizeof (nd_neighbor_advert_t))) {
2050 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2051 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2052 			return;
2053 		}
2054 		/* At this point we have a verified NA per spec */
2055 		len -= sizeof (nd_neighbor_advert_t);
2056 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2057 		if (opt != NULL) {
2058 			haddr = (uchar_t *)&opt[1];
2059 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2060 			    hlen == 0) {
2061 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2062 				BUMP_MIB(mib,
2063 				    ipv6IfIcmpInBadNeighborAdvertisements);
2064 				return;
2065 			}
2066 		}
2067 	}
2068 
2069 	/*
2070 	 * If this interface is part of the group look at all the
2071 	 * ills in the group.
2072 	 */
2073 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2074 	if (ill->ill_group != NULL)
2075 		ill = ill->ill_group->illgrp_ill;
2076 
2077 	for (; ill != NULL; ill = ill->ill_group_next) {
2078 		mutex_enter(&ill->ill_lock);
2079 		if (!ILL_CAN_LOOKUP(ill)) {
2080 			mutex_exit(&ill->ill_lock);
2081 			continue;
2082 		}
2083 		ill_refhold_locked(ill);
2084 		mutex_exit(&ill->ill_lock);
2085 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2086 		/* We have to drop the lock since ndp_process calls put* */
2087 		rw_exit(&ipst->ips_ill_g_lock);
2088 		if (dst_nce != NULL) {
2089 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2090 			    dst_nce->nce_state == ND_PROBE) {
2091 				/*
2092 				 * Someone else sent an advertisement for an
2093 				 * address that we're trying to configure.
2094 				 * Tear it down.  Note that dl_mp might be NULL
2095 				 * if we're getting a unicast reply.  This
2096 				 * isn't typically done (multicast is the norm
2097 				 * in response to a probe), but ip_ndp_failure
2098 				 * will handle the dl_mp == NULL case as well.
2099 				 */
2100 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2101 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2102 				/*
2103 				 * Someone just announced one of our local
2104 				 * addresses.  If it wasn't us, then this is a
2105 				 * conflict.  Defend the address or shut it
2106 				 * down.
2107 				 */
2108 				if (dl_mp != NULL &&
2109 				    (haddr == NULL ||
2110 				    nce_cmp_ll_addr(dst_nce, haddr,
2111 				    ill->ill_nd_lla_len))) {
2112 					ip_ndp_conflict(ill, mp, dl_mp,
2113 					    dst_nce);
2114 				}
2115 			} else {
2116 				if (na->nd_na_flags_reserved &
2117 				    ND_NA_FLAG_ROUTER) {
2118 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2119 				}
2120 				/* B_TRUE indicates this an advertisement */
2121 				ndp_process(dst_nce, haddr,
2122 				    na->nd_na_flags_reserved, B_TRUE);
2123 			}
2124 			NCE_REFRELE(dst_nce);
2125 		}
2126 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2127 		ill_refrele(ill);
2128 	}
2129 	rw_exit(&ipst->ips_ill_g_lock);
2130 }
2131 
2132 /*
2133  * Process NDP neighbor solicitation/advertisement messages.
2134  * The checksum has already checked o.k before reaching here.
2135  */
2136 void
2137 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2138 {
2139 	icmp6_t		*icmp_nd;
2140 	ip6_t		*ip6h;
2141 	int		len;
2142 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2143 
2144 
2145 	if (!pullupmsg(mp, -1)) {
2146 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2147 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2148 		goto done;
2149 	}
2150 	ip6h = (ip6_t *)mp->b_rptr;
2151 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2152 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2153 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2154 		goto done;
2155 	}
2156 	/*
2157 	 * NDP does not accept any extension headers between the
2158 	 * IP header and the ICMP header since e.g. a routing
2159 	 * header could be dangerous.
2160 	 * This assumes that any AH or ESP headers are removed
2161 	 * by ip prior to passing the packet to ndp_input.
2162 	 */
2163 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2164 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2165 		    ip6h->ip6_nxt));
2166 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2167 		goto done;
2168 	}
2169 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2170 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2171 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2172 	if (icmp_nd->icmp6_code != 0) {
2173 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2174 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2175 		goto done;
2176 	}
2177 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2178 	/*
2179 	 * Make sure packet length is large enough for either
2180 	 * a NS or a NA icmp packet.
2181 	 */
2182 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2183 		ip1dbg(("ndp_input: packet too short\n"));
2184 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2185 		goto done;
2186 	}
2187 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2188 		ndp_input_solicit(ill, mp, dl_mp);
2189 	} else {
2190 		ndp_input_advert(ill, mp, dl_mp);
2191 	}
2192 done:
2193 	freemsg(mp);
2194 }
2195 
2196 /*
2197  * nce_xmit is called to form and transmit a ND solicitation or
2198  * advertisement ICMP packet.
2199  *
2200  * If the source address is unspecified and this isn't a probe (used for
2201  * duplicate address detection), an appropriate source address and link layer
2202  * address will be chosen here.  The link layer address option is included if
2203  * the source is specified (i.e., all non-probe packets), and omitted (per the
2204  * specification) otherwise.
2205  *
2206  * It returns B_FALSE only if it does a successful put() to the
2207  * corresponding ill's ill_wq otherwise returns B_TRUE.
2208  */
2209 static boolean_t
2210 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2211     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2212     int flag)
2213 {
2214 	uint32_t	len;
2215 	icmp6_t 	*icmp6;
2216 	mblk_t		*mp;
2217 	ip6_t		*ip6h;
2218 	nd_opt_hdr_t	*opt;
2219 	uint_t		plen;
2220 	ip6i_t		*ip6i;
2221 	ipif_t		*src_ipif = NULL;
2222 	uint8_t		*hw_addr;
2223 	zoneid_t	zoneid = GLOBAL_ZONEID;
2224 
2225 	/*
2226 	 * If we have a unspecified source(sender) address, select a
2227 	 * proper source address for the solicitation here itself so
2228 	 * that we can initialize the h/w address correctly. This is
2229 	 * needed for interface groups as source address can come from
2230 	 * the whole group and the h/w address initialized from ill will
2231 	 * be wrong if the source address comes from a different ill.
2232 	 *
2233 	 * If the sender is specified then we use this address in order
2234 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2235 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2236 	 * by IP (we cannot guarantee that the global zone has an interface
2237 	 * route to the destination).
2238 	 *
2239 	 * Note that the NA never comes here with the unspecified source
2240 	 * address. The following asserts that whenever the source
2241 	 * address is specified, the haddr also should be specified.
2242 	 */
2243 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2244 
2245 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2246 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2247 		/*
2248 		 * Pick a source address for this solicitation, but
2249 		 * restrict the selection to addresses assigned to the
2250 		 * output interface (or interface group).  We do this
2251 		 * because the destination will create a neighbor cache
2252 		 * entry for the source address of this packet, so the
2253 		 * source address had better be a valid neighbor.
2254 		 */
2255 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2256 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2257 		if (src_ipif == NULL) {
2258 			char buf[INET6_ADDRSTRLEN];
2259 
2260 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2261 			    inet_ntop(AF_INET6, (char *)target, buf,
2262 			    sizeof (buf))));
2263 			return (B_TRUE);
2264 		}
2265 		sender = &src_ipif->ipif_v6src_addr;
2266 		hwaddr_ill = src_ipif->ipif_ill;
2267 	} else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2268 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
2269 		/*
2270 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2271 		 * ALL_ZONES if it cannot find a matching ipif for the address
2272 		 * we are trying to use. In this case we err on the side of
2273 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2274 		 */
2275 		if (zoneid == ALL_ZONES)
2276 			zoneid = GLOBAL_ZONEID;
2277 	}
2278 
2279 	/*
2280 	 * Always make sure that the NS/NA packets don't get load
2281 	 * spread. This is needed so that the probe packets sent
2282 	 * by the in.mpathd daemon can really go out on the desired
2283 	 * interface. Probe packets are made to go out on a desired
2284 	 * interface by including a ip6i with ATTACH_IF flag. As these
2285 	 * packets indirectly end up sending/receiving NS/NA packets
2286 	 * (neighbor doing NUD), we have to make sure that NA
2287 	 * also go out on the same interface.
2288 	 */
2289 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2290 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2291 	    plen * 8;
2292 	mp = allocb(len,  BPRI_LO);
2293 	if (mp == NULL) {
2294 		if (src_ipif != NULL)
2295 			ipif_refrele(src_ipif);
2296 		return (B_TRUE);
2297 	}
2298 	bzero((char *)mp->b_rptr, len);
2299 	mp->b_wptr = mp->b_rptr + len;
2300 
2301 	ip6i = (ip6i_t *)mp->b_rptr;
2302 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2303 	ip6i->ip6i_nxt = IPPROTO_RAW;
2304 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2305 	if (flag & NDP_PROBE)
2306 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2307 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2308 
2309 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2310 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2311 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2312 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2313 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2314 	ip6h->ip6_dst = *target;
2315 	icmp6 = (icmp6_t *)&ip6h[1];
2316 
2317 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2318 	    sizeof (nd_neighbor_advert_t));
2319 
2320 	if (operation == ND_NEIGHBOR_SOLICIT) {
2321 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2322 
2323 		if (!(flag & NDP_PROBE))
2324 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2325 		ip6h->ip6_src = *sender;
2326 		ns->nd_ns_target = *target;
2327 		if (!(flag & NDP_UNICAST)) {
2328 			/* Form multicast address of the target */
2329 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2330 			ip6h->ip6_dst.s6_addr32[3] |=
2331 			    ns->nd_ns_target.s6_addr32[3];
2332 		}
2333 	} else {
2334 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2335 
2336 		ASSERT(!(flag & NDP_PROBE));
2337 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2338 		ip6h->ip6_src = *sender;
2339 		na->nd_na_target = *sender;
2340 		if (flag & NDP_ISROUTER)
2341 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2342 		if (flag & NDP_SOLICITED)
2343 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2344 		if (flag & NDP_ORIDE)
2345 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2346 	}
2347 
2348 	hw_addr = NULL;
2349 	if (!(flag & NDP_PROBE)) {
2350 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2351 		    hwaddr_ill->ill_phys_addr;
2352 		if (hw_addr != NULL) {
2353 			/* Fill in link layer address and option len */
2354 			opt->nd_opt_len = (uint8_t)plen;
2355 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2356 		}
2357 	}
2358 	if (hw_addr == NULL) {
2359 		/* If there's no link layer address option, then strip it. */
2360 		len -= plen * 8;
2361 		mp->b_wptr = mp->b_rptr + len;
2362 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2363 	}
2364 
2365 	icmp6->icmp6_type = (uint8_t)operation;
2366 	icmp6->icmp6_code = 0;
2367 	/*
2368 	 * Prepare for checksum by putting icmp length in the icmp
2369 	 * checksum field. The checksum is calculated in ip_wput_v6.
2370 	 */
2371 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2372 
2373 	if (src_ipif != NULL)
2374 		ipif_refrele(src_ipif);
2375 
2376 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2377 	return (B_FALSE);
2378 }
2379 
2380 /*
2381  * Make a link layer address (does not include the SAP) from an nce.
2382  * To form the link layer address, use the last four bytes of ipv6
2383  * address passed in and the fixed offset stored in nce.
2384  */
2385 static void
2386 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2387 {
2388 	uchar_t *mask, *to;
2389 	ill_t	*ill = nce->nce_ill;
2390 	int 	len;
2391 
2392 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2393 		return;
2394 	ASSERT(nce->nce_res_mp != NULL);
2395 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2396 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2397 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2398 	ASSERT(addr != NULL);
2399 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2400 	    addrpos, ill->ill_nd_lla_len);
2401 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2402 	    IPV6_ADDR_LEN);
2403 	mask = (uchar_t *)&nce->nce_extract_mask;
2404 	mask += (IPV6_ADDR_LEN - len);
2405 	addr += (IPV6_ADDR_LEN - len);
2406 	to = addrpos + nce->nce_ll_extract_start;
2407 	while (len-- > 0)
2408 		*to++ |= *mask++ & *addr++;
2409 }
2410 
2411 mblk_t *
2412 nce_udreq_alloc(ill_t *ill)
2413 {
2414 	mblk_t	*template_mp = NULL;
2415 	dl_unitdata_req_t *dlur;
2416 	int	sap_length;
2417 
2418 	ASSERT(ill->ill_isv6);
2419 
2420 	sap_length = ill->ill_sap_length;
2421 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2422 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2423 	if (template_mp == NULL)
2424 		return (NULL);
2425 
2426 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2427 	dlur->dl_priority.dl_min = 0;
2428 	dlur->dl_priority.dl_max = 0;
2429 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2430 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2431 
2432 	/* Copy in the SAP value. */
2433 	NCE_LL_SAP_COPY(ill, template_mp);
2434 
2435 	return (template_mp);
2436 }
2437 
/*
 * NDP retransmit timer.
 * This timer goes off when:
 * a. It is time to retransmit NS for resolver.
 * b. It is time to send reachability probes.
 *
 * arg is the nce_t the timer was armed for.  On entry the function takes
 * ips_ill_g_lock (reader) and nce_lock, places a hold on the nce and clears
 * nce_timeout_id.  What happens next depends on nce_state:
 *
 *  ND_DELAY	  - move to ND_PROBE, send a unicast NS and restart the timer.
 *  ND_PROBE	  - count down nce_pcnt; retransmit, give up and delete the
 *		    entry, or (for a permanent entry) finish probing and start
 *		    advertising the address.
 *  ND_INCOMPLETE - flag queued multipathing probe packets as delayed and,
 *		    if anything is queued, solicit again or fail resolution.
 *  ND_REACHABLE  - send unsolicited / defensive advertisements when due.
 *
 * Every path drops both locks and releases the hold before returning.
 */
void
ndp_timer(void *arg)
{
	nce_t		*nce = arg;
	ill_t		*ill = nce->nce_ill;
	uint32_t	ms;
	char		addrbuf[INET6_ADDRSTRLEN];
	mblk_t		*mp;
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * The timer has to be cancelled by ndp_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the nce
	 */
	ASSERT(nce != NULL);

	/*
	 * Grab the ill_g_lock now itself to avoid lock order problems.
	 * nce_solicit needs ill_g_lock to be able to traverse ills
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	NCE_REFHOLD_LOCKED(nce);
	nce->nce_timeout_id = 0;

	/*
	 * Check the reachability state first.
	 */
	switch (nce->nce_state) {
	case ND_DELAY:
		/*
		 * Switch to ND_PROBE and send the first unicast solicitation.
		 * The source address is left unspecified, so nce_xmit picks
		 * one.
		 */
		rw_exit(&ipst->ips_ill_g_lock);
		nce->nce_state = ND_PROBE;
		mutex_exit(&nce->nce_lock);
		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("ndp_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &nce->nce_addr);
		}
		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
		NCE_REFRELE(nce);
		return;
	case ND_PROBE:
		/* must be retransmit timer */
		rw_exit(&ipst->ips_ill_g_lock);
		nce->nce_pcnt--;
		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
		    nce->nce_pcnt >= -1);
		if (nce->nce_pcnt > 0) {
			/*
			 * As per RFC2461, the nce gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
			    addrbuf, sizeof (addrbuf))));
			mutex_exit(&nce->nce_lock);
			/*
			 * Permanent (local) entries are probed with NDP_PROBE,
			 * all others get a unicast solicitation.
			 */
			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
			    NDP_UNICAST);
			if (dropped) {
				/* Nothing went out; don't count this attempt */
				mutex_enter(&nce->nce_lock);
				nce->nce_pcnt++;
				mutex_exit(&nce->nce_lock);
			}
			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
		} else if (nce->nce_pcnt < 0) {
			/* No hope, delete the nce */
			nce->nce_state = ND_UNREACHABLE;
			mutex_exit(&nce->nce_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("ndp_timer: Delete IRE for"
				    " dst %s\n", AF_INET6, &nce->nce_addr);
			}
			ndp_delete(nce);
		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
			/* Wait RetransTimer, before deleting the entry */
			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
			    nce->nce_pcnt, inet_ntop(AF_INET6,
			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
			mutex_exit(&nce->nce_lock);
			/* Wait one interval before killing */
			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable.  Let IP know that it's ok to
			 * use.
			 */
			nce->nce_state = ND_REACHABLE;
			mutex_exit(&nce->nce_lock);
			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
			    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
			if (ipif != NULL) {
				/*
				 * If the address had earlier been marked as a
				 * duplicate, clear that and log the recovery.
				 */
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ + 10];
					char sbuf[INET6_ADDRSTRLEN];

					ipif->ipif_was_dup = B_FALSE;
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					ipif_get_name(ipif, ibuf,
					    sizeof (ibuf));
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				/* Notify only on the first transition */
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready)
					ipif_up_notify(ipif);
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			/*
			 * Begin defending our new address
			 *
			 * NOTE(review): nce_unsolicit_count is written below
			 * without nce_lock held, unlike the ND_REACHABLE case
			 * further down; confirm this is intended.
			 */
			nce->nce_unsolicit_count = 0;
			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
			    nce_advert_flags(nce));
			if (dropped) {
				/* Try the advertisement again shortly */
				nce->nce_unsolicit_count = 1;
				NDP_RESTART_TIMER(nce,
				    ipst->ips_ip_ndp_unsolicit_interval);
			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
				NDP_RESTART_TIMER(nce,
				    ipst->ips_ip_ndp_defense_interval);
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down.  Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			nce->nce_state = ND_REACHABLE;
			mutex_exit(&nce->nce_lock);
		}
		NCE_REFRELE(nce);
		return;
	case ND_INCOMPLETE:
		/*
		 * Must be resolvers retransmit timer.
		 */
		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
			ip6i_t	*ip6i;
			ip6_t	*ip6h;
			mblk_t *data_mp;

			/*
			 * Walk the list of packets queued, and see if there
			 * are any multipathing probe packets. Such packets
			 * are always queued at the head. Since this is a
			 * retransmit timer firing, mark such packets as
			 * delayed in ND resolution. This info will be used
			 * in ip_wput_v6(). Multipathing probe packets will
			 * always have an ip6i_t. Once we hit a packet without
			 * it, we can break out of this loop.
			 */
			if (mp->b_datap->db_type == M_CTL)
				data_mp = mp->b_cont;
			else
				data_mp = mp;

			/* A leading ip6i_t is recognized by IPPROTO_RAW */
			ip6h = (ip6_t *)data_mp->b_rptr;
			if (ip6h->ip6_nxt != IPPROTO_RAW)
				break;

			/*
			 * This message should have been pulled up already in
			 * ip_wput_v6. We can't do pullups here because the
			 * b_next/b_prev is non-NULL.
			 */
			ip6i = (ip6i_t *)ip6h;
			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
			    sizeof (ip6i_t) + IPV6_HDR_LEN);

			/* Mark this packet as delayed due to ND resolution */
			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
		}
		if (nce->nce_qd_mp != NULL) {
			/*
			 * Packets are still waiting.  nce_solicit() is called
			 * with both locks held and returns the next timer
			 * interval, or 0 when no further solicitation is
			 * scheduled.
			 */
			ms = nce_solicit(nce, NULL);
			rw_exit(&ipst->ips_ill_g_lock);
			if (ms == 0) {
				if (nce->nce_state != ND_REACHABLE) {
					/* Resolution failed; tear it down */
					mutex_exit(&nce->nce_lock);
					nce_resolv_failed(nce);
					ndp_delete(nce);
				} else {
					mutex_exit(&nce->nce_lock);
				}
			} else {
				mutex_exit(&nce->nce_lock);
				NDP_RESTART_TIMER(nce, (clock_t)ms);
			}
			NCE_REFRELE(nce);
			return;
		}
		/* Nothing queued, so there is nothing to resolve for. */
		mutex_exit(&nce->nce_lock);
		rw_exit(&ipst->ips_ill_g_lock);
		NCE_REFRELE(nce);
		break;
	case ND_REACHABLE :
		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * Advertise if unsolicited advertisements are still owed for
		 * this entry, or if this is a permanent entry and periodic
		 * defense is enabled.
		 */
		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
		    nce->nce_unsolicit_count != 0) ||
		    ((nce->nce_flags & NCE_F_PERMANENT) &&
		    ipst->ips_ip_ndp_defense_interval != 0)) {
			if (nce->nce_unsolicit_count > 0)
				nce->nce_unsolicit_count--;
			mutex_exit(&nce->nce_lock);
			dropped = nce_xmit(ill,
			    ND_NEIGHBOR_ADVERT,
			    ill,	/* ill to be used for hw addr */
			    B_FALSE,	/* use ill_phys_addr */
			    &nce->nce_addr,
			    &ipv6_all_hosts_mcast,
			    nce_advert_flags(nce));
			if (dropped) {
				/* Nothing went out; this one is still owed */
				mutex_enter(&nce->nce_lock);
				nce->nce_unsolicit_count++;
				mutex_exit(&nce->nce_lock);
			}
			/*
			 * Use the short interval while advertisements are
			 * still owed, the defense interval afterwards.
			 */
			if (nce->nce_unsolicit_count != 0) {
				NDP_RESTART_TIMER(nce,
				    ipst->ips_ip_ndp_unsolicit_interval);
			} else {
				NDP_RESTART_TIMER(nce,
				    ipst->ips_ip_ndp_defense_interval);
			}
		} else {
			mutex_exit(&nce->nce_lock);
		}
		NCE_REFRELE(nce);
		break;
	default:
		/* No timer driven work in any other state. */
		rw_exit(&ipst->ips_ill_g_lock);
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	}
}
2695 
2696 /*
2697  * Set a link layer address from the ll_addr passed in.
2698  * Copy SAP from ill.
2699  */
2700 static void
2701 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2702 {
2703 	ill_t	*ill = nce->nce_ill;
2704 	uchar_t	*woffset;
2705 
2706 	ASSERT(ll_addr != NULL);
2707 	/* Always called before fast_path_probe */
2708 	ASSERT(nce->nce_fp_mp == NULL);
2709 	if (ill->ill_sap_length != 0) {
2710 		/*
2711 		 * Copy the SAP type specified in the
2712 		 * request into the xmit template.
2713 		 */
2714 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2715 	}
2716 	if (ill->ill_phys_addr_length > 0) {
2717 		/*
2718 		 * The bcopy() below used to be called for the physical address
2719 		 * length rather than the link layer address length. For
2720 		 * ethernet and many other media, the phys_addr and lla are
2721 		 * identical.
2722 		 * However, with xresolv interfaces being introduced, the
2723 		 * phys_addr and lla are no longer the same, and the physical
2724 		 * address may not have any useful meaning, so we use the lla
2725 		 * for IPv6 address resolution and destination addressing.
2726 		 *
2727 		 * For PPP or other interfaces with a zero length
2728 		 * physical address, don't do anything here.
2729 		 * The bcopy() with a zero phys_addr length was previously
2730 		 * a no-op for interfaces with a zero-length physical address.
2731 		 * Using the lla for them would change the way they operate.
2732 		 * Doing nothing in such cases preserves expected behavior.
2733 		 */
2734 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2735 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2736 	}
2737 }
2738 
2739 static boolean_t
2740 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2741 {
2742 	ill_t	*ill = nce->nce_ill;
2743 	uchar_t	*ll_offset;
2744 
2745 	ASSERT(nce->nce_res_mp != NULL);
2746 	if (ll_addr == NULL)
2747 		return (B_FALSE);
2748 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2749 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2750 		return (B_TRUE);
2751 	return (B_FALSE);
2752 }
2753 
2754 /*
2755  * Updates the link layer address or the reachability state of
2756  * a cache entry.  Reset probe counter if needed.
2757  */
2758 static void
2759 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2760 {
2761 	ill_t	*ill = nce->nce_ill;
2762 	boolean_t need_stop_timer = B_FALSE;
2763 	boolean_t need_fastpath_update = B_FALSE;
2764 
2765 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2766 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2767 	/*
2768 	 * If this interface does not do NUD, there is no point
2769 	 * in allowing an update to the cache entry.  Although
2770 	 * we will respond to NS.
2771 	 * The only time we accept an update for a resolver when
2772 	 * NUD is turned off is when it has just been created.
2773 	 * Non-Resolvers will always be created as REACHABLE.
2774 	 */
2775 	if (new_state != ND_UNCHANGED) {
2776 		if ((nce->nce_flags & NCE_F_NONUD) &&
2777 		    (nce->nce_state != ND_INCOMPLETE))
2778 			return;
2779 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2780 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2781 		need_stop_timer = B_TRUE;
2782 		if (new_state == ND_REACHABLE)
2783 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2784 		else {
2785 			/* We force NUD in this case */
2786 			nce->nce_last = 0;
2787 		}
2788 		nce->nce_state = new_state;
2789 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2790 	}
2791 	/*
2792 	 * In case of fast path we need to free the the fastpath
2793 	 * M_DATA and do another probe.  Otherwise we can just
2794 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2795 	 * whatever packets that happens to be transmitting at the time.
2796 	 */
2797 	if (new_ll_addr != NULL) {
2798 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2799 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2800 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2801 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2802 		if (nce->nce_fp_mp != NULL) {
2803 			freemsg(nce->nce_fp_mp);
2804 			nce->nce_fp_mp = NULL;
2805 		}
2806 		need_fastpath_update = B_TRUE;
2807 	}
2808 	mutex_exit(&nce->nce_lock);
2809 	if (need_stop_timer) {
2810 		(void) untimeout(nce->nce_timeout_id);
2811 		nce->nce_timeout_id = 0;
2812 	}
2813 	if (need_fastpath_update)
2814 		nce_fastpath(nce);
2815 	mutex_enter(&nce->nce_lock);
2816 }
2817 
2818 void
2819 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2820 {
2821 	uint_t	count = 0;
2822 	mblk_t  **mpp;
2823 
2824 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2825 
2826 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2827 	    mpp = &(*mpp)->b_next) {
2828 		if (++count >
2829 		    nce->nce_ill->ill_max_buf) {
2830 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2831 
2832 			nce->nce_qd_mp->b_next = NULL;
2833 			nce->nce_qd_mp->b_prev = NULL;
2834 			freemsg(nce->nce_qd_mp);
2835 			nce->nce_qd_mp = tmp;
2836 		}
2837 	}
2838 	/* put this on the list */
2839 	if (head_insert) {
2840 		mp->b_next = nce->nce_qd_mp;
2841 		nce->nce_qd_mp = mp;
2842 	} else {
2843 		*mpp = mp;
2844 	}
2845 }
2846 
2847 static void
2848 nce_queue_mp(nce_t *nce, mblk_t *mp)
2849 {
2850 	boolean_t head_insert = B_FALSE;
2851 	ip6_t	*ip6h;
2852 	ip6i_t	*ip6i;
2853 	mblk_t *data_mp;
2854 
2855 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2856 
2857 	if (mp->b_datap->db_type == M_CTL)
2858 		data_mp = mp->b_cont;
2859 	else
2860 		data_mp = mp;
2861 	ip6h = (ip6_t *)data_mp->b_rptr;
2862 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2863 		/*
2864 		 * This message should have been pulled up already in
2865 		 * ip_wput_v6. We can't do pullups here because the message
2866 		 * could be from the nce_qd_mp which could have b_next/b_prev
2867 		 * non-NULL.
2868 		 */
2869 		ip6i = (ip6i_t *)ip6h;
2870 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2871 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
2872 		/*
2873 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2874 		 * This has 2 aspects mentioned below.
2875 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2876 		 * This ensures that next retransmit of ND solicitation
2877 		 * will use the interface specified by the probe packet,
2878 		 * for both NS and NA. This corresponds to the src address
2879 		 * in the IPv6 packet. If we insert at tail, we will be
2880 		 * depending on the packet at the head for successful
2881 		 * ND resolution. This is not reliable, because the interface
2882 		 * on which the NA arrives could be different from the interface
2883 		 * on which the NS was sent, and if the receiving interface is
2884 		 * failed, it will appear that the sending interface is also
2885 		 * failed, causing in.mpathd to misdiagnose this as link
2886 		 * failure.
2887 		 * 2. Drop the original packet, if the ND resolution did not
2888 		 * succeed in the first attempt. However we will create the
2889 		 * nce and the ire, as soon as the ND resolution succeeds.
2890 		 * We don't gain anything by queueing multiple probe packets
2891 		 * and sending them back-to-back once resolution succeeds.
2892 		 * It is sufficient to send just 1 packet after ND resolution
2893 		 * succeeds. Since mpathd is sending down probe packets at a
2894 		 * constant rate, we don't need to send the queued packet. We
2895 		 * need to queue it only for NDP resolution. The benefit of
2896 		 * dropping the probe packets that were delayed in ND
2897 		 * resolution, is that in.mpathd will not see inflated
2898 		 * RTT. If the ND resolution does not succeed within
2899 		 * in.mpathd's failure detection time, mpathd may detect
2900 		 * a failure, and it does not matter whether the packet
2901 		 * was queued or dropped.
2902 		 */
2903 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2904 			head_insert = B_TRUE;
2905 	}
2906 
2907 	nce_queue_mp_common(nce, mp, head_insert);
2908 }
2909 
2910 /*
2911  * Called when address resolution failed due to a timeout.
2912  * Send an ICMP unreachable in response to all queued packets.
2913  */
2914 void
2915 nce_resolv_failed(nce_t *nce)
2916 {
2917 	mblk_t	*mp, *nxt_mp, *first_mp;
2918 	char	buf[INET6_ADDRSTRLEN];
2919 	ip6_t *ip6h;
2920 	zoneid_t zoneid = GLOBAL_ZONEID;
2921 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
2922 
2923 	ip1dbg(("nce_resolv_failed: dst %s\n",
2924 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
2925 	mutex_enter(&nce->nce_lock);
2926 	mp = nce->nce_qd_mp;
2927 	nce->nce_qd_mp = NULL;
2928 	mutex_exit(&nce->nce_lock);
2929 	while (mp != NULL) {
2930 		nxt_mp = mp->b_next;
2931 		mp->b_next = NULL;
2932 		mp->b_prev = NULL;
2933 
2934 		first_mp = mp;
2935 		if (mp->b_datap->db_type == M_CTL) {
2936 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
2937 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
2938 			zoneid = io->ipsec_out_zoneid;
2939 			ASSERT(zoneid != ALL_ZONES);
2940 			mp = mp->b_cont;
2941 			mp->b_next = NULL;
2942 			mp->b_prev = NULL;
2943 		}
2944 
2945 		ip6h = (ip6_t *)mp->b_rptr;
2946 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
2947 			ip6i_t *ip6i;
2948 			/*
2949 			 * This message should have been pulled up already
2950 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
2951 			 * the header is pulled up.
2952 			 */
2953 			ip6i = (ip6i_t *)ip6h;
2954 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
2955 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2956 			mp->b_rptr += sizeof (ip6i_t);
2957 		}
2958 		/*
2959 		 * Ignore failure since icmp_unreachable_v6 will silently
2960 		 * drop packets with an unspecified source address.
2961 		 */
2962 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
2963 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
2964 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
2965 		mp = nxt_mp;
2966 	}
2967 }
2968 
2969 /*
2970  * Called by SIOCSNDP* ioctl to add/change an nce entry
2971  * and the corresponding attributes.
2972  * Disallow states other than ND_REACHABLE or ND_STALE.
2973  */
2974 int
2975 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2976 {
2977 	sin6_t		*sin6;
2978 	in6_addr_t	*addr;
2979 	nce_t		*nce;
2980 	int		err;
2981 	uint16_t	new_flags = 0;
2982 	uint16_t	old_flags = 0;
2983 	int		inflags = lnr->lnr_flags;
2984 	ip_stack_t	*ipst = ill->ill_ipst;
2985 
2986 	ASSERT(ill->ill_isv6);
2987 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2988 	    (lnr->lnr_state_create != ND_STALE))
2989 		return (EINVAL);
2990 
2991 	sin6 = (sin6_t *)&lnr->lnr_addr;
2992 	addr = &sin6->sin6_addr;
2993 
2994 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2995 	/* We know it can not be mapping so just look in the hash table */
2996 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
2997 	nce = nce_lookup_addr(ill, addr, nce);
2998 	if (nce != NULL)
2999 		new_flags = nce->nce_flags;
3000 
3001 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3002 	case NDF_ISROUTER_ON:
3003 		new_flags |= NCE_F_ISROUTER;
3004 		break;
3005 	case NDF_ISROUTER_OFF:
3006 		new_flags &= ~NCE_F_ISROUTER;
3007 		break;
3008 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3009 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3010 		if (nce != NULL)
3011 			NCE_REFRELE(nce);
3012 		return (EINVAL);
3013 	}
3014 
3015 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3016 	case NDF_ANYCAST_ON:
3017 		new_flags |= NCE_F_ANYCAST;
3018 		break;
3019 	case NDF_ANYCAST_OFF:
3020 		new_flags &= ~NCE_F_ANYCAST;
3021 		break;
3022 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3023 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3024 		if (nce != NULL)
3025 			NCE_REFRELE(nce);
3026 		return (EINVAL);
3027 	}
3028 
3029 	if (nce == NULL) {
3030 		err = ndp_add_v6(ill,
3031 		    (uchar_t *)lnr->lnr_hdw_addr,
3032 		    addr,
3033 		    &ipv6_all_ones,
3034 		    &ipv6_all_zeros,
3035 		    0,
3036 		    new_flags,
3037 		    lnr->lnr_state_create,
3038 		    &nce);
3039 		if (err != 0) {
3040 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3041 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3042 			return (err);
3043 		}
3044 	}
3045 	old_flags = nce->nce_flags;
3046 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3047 		/*
3048 		 * Router turned to host, delete all ires.
3049 		 * XXX Just delete the entry, but we need to add too.
3050 		 */
3051 		nce->nce_flags &= ~NCE_F_ISROUTER;
3052 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3053 		ndp_delete(nce);
3054 		NCE_REFRELE(nce);
3055 		return (0);
3056 	}
3057 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3058 
3059 	mutex_enter(&nce->nce_lock);
3060 	nce->nce_flags = new_flags;
3061 	mutex_exit(&nce->nce_lock);
3062 	/*
3063 	 * Note that we ignore the state at this point, which
3064 	 * should be either STALE or REACHABLE.  Instead we let
3065 	 * the link layer address passed in to determine the state
3066 	 * much like incoming packets.
3067 	 */
3068 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3069 	NCE_REFRELE(nce);
3070 	return (0);
3071 }
3072 
3073 /*
3074  * If the device driver supports it, we make nce_fp_mp to have
3075  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3076  * The caller ensures there is hold on nce for this function.
3077  * Note that since ill_fastpath_probe() copies the mblk there is
3078  * no need for the hold beyond this function.
3079  */
3080 void
3081 nce_fastpath(nce_t *nce)
3082 {
3083 	ill_t	*ill = nce->nce_ill;
3084 	int res;
3085 
3086 	ASSERT(ill != NULL);
3087 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3088 
3089 	if (nce->nce_fp_mp != NULL) {
3090 		/* Already contains fastpath info */
3091 		return;
3092 	}
3093 	if (nce->nce_res_mp != NULL) {
3094 		nce_fastpath_list_add(nce);
3095 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3096 		/*
3097 		 * EAGAIN is an indication of a transient error
3098 		 * i.e. allocation failure etc. leave the nce in the list it
3099 		 * will be updated when another probe happens for another ire
3100 		 * if not it will be taken out of the list when the ire is
3101 		 * deleted.
3102 		 */
3103 
3104 		if (res != 0 && res != EAGAIN)
3105 			nce_fastpath_list_delete(nce);
3106 	}
3107 }
3108 
3109 /*
3110  * Drain the list of nce's waiting for fastpath response.
3111  */
3112 void
3113 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3114     void *arg)
3115 {
3116 
3117 	nce_t *next_nce;
3118 	nce_t *current_nce;
3119 	nce_t *first_nce;
3120 	nce_t *prev_nce = NULL;
3121 
3122 	mutex_enter(&ill->ill_lock);
3123 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3124 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3125 		next_nce = current_nce->nce_fastpath;
3126 		/*
3127 		 * Take it off the list if we're flushing, or if the callback
3128 		 * routine tells us to do so.  Otherwise, leave the nce in the
3129 		 * fastpath list to handle any pending response from the lower
3130 		 * layer.  We can't drain the list when the callback routine
3131 		 * comparison failed, because the response is asynchronous in
3132 		 * nature, and may not arrive in the same order as the list
3133 		 * insertion.
3134 		 */
3135 		if (func == NULL || func(current_nce, arg)) {
3136 			current_nce->nce_fastpath = NULL;
3137 			if (current_nce == first_nce)
3138 				ill->ill_fastpath_list = first_nce = next_nce;
3139 			else
3140 				prev_nce->nce_fastpath = next_nce;
3141 		} else {
3142 			/* previous element that is still in the list */
3143 			prev_nce = current_nce;
3144 		}
3145 		current_nce = next_nce;
3146 	}
3147 	mutex_exit(&ill->ill_lock);
3148 }
3149 
3150 /*
3151  * Add nce to the nce fastpath list.
3152  */
3153 void
3154 nce_fastpath_list_add(nce_t *nce)
3155 {
3156 	ill_t *ill;
3157 
3158 	ill = nce->nce_ill;
3159 
3160 	mutex_enter(&ill->ill_lock);
3161 	mutex_enter(&nce->nce_lock);
3162 
3163 	/*
3164 	 * if nce has not been deleted and
3165 	 * is not already in the list add it.
3166 	 */
3167 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3168 	    (nce->nce_fastpath == NULL)) {
3169 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3170 		ill->ill_fastpath_list = nce;
3171 	}
3172 
3173 	mutex_exit(&nce->nce_lock);
3174 	mutex_exit(&ill->ill_lock);
3175 }
3176 
3177 /*
3178  * remove nce from the nce fastpath list.
3179  */
3180 void
3181 nce_fastpath_list_delete(nce_t *nce)
3182 {
3183 	nce_t *nce_ptr;
3184 
3185 	ill_t *ill;
3186 
3187 	ill = nce->nce_ill;
3188 	ASSERT(ill != NULL);
3189 
3190 	mutex_enter(&ill->ill_lock);
3191 	if (nce->nce_fastpath == NULL)
3192 		goto done;
3193 
3194 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3195 
3196 	if (ill->ill_fastpath_list == nce) {
3197 		ill->ill_fastpath_list = nce->nce_fastpath;
3198 	} else {
3199 		nce_ptr = ill->ill_fastpath_list;
3200 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3201 			if (nce_ptr->nce_fastpath == nce) {
3202 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3203 				break;
3204 			}
3205 			nce_ptr = nce_ptr->nce_fastpath;
3206 		}
3207 	}
3208 
3209 	nce->nce_fastpath = NULL;
3210 done:
3211 	mutex_exit(&ill->ill_lock);
3212 }
3213 
3214 /*
3215  * Update all NCE's that are not in fastpath mode and
3216  * have an nce_fp_mp that matches mp. mp->b_cont contains
3217  * the fastpath header.
3218  *
3219  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3220  */
3221 boolean_t
3222 ndp_fastpath_update(nce_t *nce, void *arg)
3223 {
3224 	mblk_t 	*mp, *fp_mp;
3225 	uchar_t	*mp_rptr, *ud_mp_rptr;
3226 	mblk_t	*ud_mp = nce->nce_res_mp;
3227 	ptrdiff_t	cmplen;
3228 
3229 	if (nce->nce_flags & NCE_F_MAPPING)
3230 		return (B_TRUE);
3231 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3232 		return (B_TRUE);
3233 
3234 	ip2dbg(("ndp_fastpath_update: trying\n"));
3235 	mp = (mblk_t *)arg;
3236 	mp_rptr = mp->b_rptr;
3237 	cmplen = mp->b_wptr - mp_rptr;
3238 	ASSERT(cmplen >= 0);
3239 	ud_mp_rptr = ud_mp->b_rptr;
3240 	/*
3241 	 * The nce is locked here to prevent any other threads
3242 	 * from accessing and changing nce_res_mp when the IPv6 address
3243 	 * becomes resolved to an lla while we're in the middle
3244 	 * of looking at and comparing the hardware address (lla).
3245 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3246 	 * from examining nce_res_mp atthe same time.
3247 	 */
3248 	mutex_enter(&nce->nce_lock);
3249 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3250 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3251 		mutex_exit(&nce->nce_lock);
3252 		/*
3253 		 * Don't take the ire off the fastpath list yet,
3254 		 * since the response may come later.
3255 		 */
3256 		return (B_FALSE);
3257 	}
3258 	/* Matched - install mp as the fastpath mp */
3259 	ip1dbg(("ndp_fastpath_update: match\n"));
3260 	fp_mp = dupb(mp->b_cont);
3261 	if (fp_mp != NULL) {
3262 		nce->nce_fp_mp = fp_mp;
3263 	}
3264 	mutex_exit(&nce->nce_lock);
3265 	return (B_TRUE);
3266 }
3267 
3268 /*
3269  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3270  * driver.  Note that it assumes IP is exclusive...
3271  */
3272 /* ARGSUSED */
3273 void
3274 ndp_fastpath_flush(nce_t *nce, char *arg)
3275 {
3276 	if (nce->nce_flags & NCE_F_MAPPING)
3277 		return;
3278 	/* No fastpath info? */
3279 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3280 		return;
3281 
3282 	if (nce->nce_ipversion == IPV4_VERSION &&
3283 	    nce->nce_flags & NCE_F_BCAST) {
3284 		/*
3285 		 * IPv4 BROADCAST entries:
3286 		 * We can't delete the nce since it is difficult to
3287 		 * recreate these without going through the
3288 		 * ipif down/up dance.
3289 		 *
3290 		 * All access to nce->nce_fp_mp in the case of these
3291 		 * is protected by nce_lock.
3292 		 */
3293 		mutex_enter(&nce->nce_lock);
3294 		if (nce->nce_fp_mp != NULL) {
3295 			freeb(nce->nce_fp_mp);
3296 			nce->nce_fp_mp = NULL;
3297 			mutex_exit(&nce->nce_lock);
3298 			nce_fastpath(nce);
3299 		} else {
3300 			mutex_exit(&nce->nce_lock);
3301 		}
3302 	} else {
3303 		/* Just delete the NCE... */
3304 		ndp_delete(nce);
3305 	}
3306 }
3307 
/*
 * Return a pointer to a given option in the packet.
 * Assumes that option part of the packet have already been validated.
 *
 * opt points at the first option, optlen is the number of option bytes
 * that follow, and opt_type is the nd_opt_type value to look for.
 * Returns the first matching option, or NULL when there is none.
 *
 * nd_opt_len counts units of 8 bytes.  A zero-length option would never
 * advance the walk, so as a safety net against unvalidated input the search
 * stops (returning NULL) as soon as one is met instead of spinning forever.
 */
nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
{
	while (optlen > 0) {
		int step;

		if (opt->nd_opt_type == opt_type)
			return (opt);
		step = 8 * opt->nd_opt_len;
		if (step == 0)
			return (NULL);
		optlen -= step;
		opt = (struct nd_opt_hdr *)((char *)opt + step);
	}
	return (NULL);
}
3323 
/*
 * Verify all option lengths present are > 0, also check to see
 * if the option lengths and packet length are consistent.
 *
 * opt points at the first option and optlen is the number of option bytes
 * available.  Returns B_TRUE when the options exactly tile optlen bytes
 * (including the trivial case of no options), B_FALSE otherwise.
 */
boolean_t
ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
{
	ASSERT(opt != NULL);
	while (optlen > 0) {
		/*
		 * Fewer bytes left than an option header: the options cannot
		 * be consistent, and the length field must not be read since
		 * it lies beyond the end of the option area.
		 */
		if (optlen < (int)sizeof (nd_opt_hdr_t))
			return (B_FALSE);
		if (opt->nd_opt_len == 0)
			return (B_FALSE);
		/* nd_opt_len is in units of 8 bytes. */
		optlen -= 8 * opt->nd_opt_len;
		if (optlen < 0)
			return (B_FALSE);
		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
	}
	return (B_TRUE);
}
3342 
3343 /*
3344  * ndp_walk function.
3345  * Free a fraction of the NCE cache entries.
3346  * A fraction of zero means to not free any in that category.
3347  */
3348 void
3349 ndp_cache_reclaim(nce_t *nce, char *arg)
3350 {
3351 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3352 	uint_t	rand;
3353 
3354 	if (nce->nce_flags & NCE_F_PERMANENT)
3355 		return;
3356 
3357 	rand = (uint_t)lbolt +
3358 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3359 	if (ncr->ncr_host != 0 &&
3360 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3361 		ndp_delete(nce);
3362 		return;
3363 	}
3364 }
3365 
3366 /*
3367  * ndp_walk function.
3368  * Count the number of NCEs that can be deleted.
3369  * These would be hosts but not routers.
3370  */
3371 void
3372 ndp_cache_count(nce_t *nce, char *arg)
3373 {
3374 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3375 
3376 	if (nce->nce_flags & NCE_F_PERMANENT)
3377 		return;
3378 
3379 	ncc->ncc_total++;
3380 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3381 		ncc->ncc_host++;
3382 }
3383 
3384 #ifdef DEBUG
3385 void
3386 nce_trace_ref(nce_t *nce)
3387 {
3388 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3389 
3390 	if (nce->nce_trace_disable)
3391 		return;
3392 
3393 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3394 		nce->nce_trace_disable = B_TRUE;
3395 		nce_trace_cleanup(nce);
3396 	}
3397 }
3398 
3399 void
3400 nce_untrace_ref(nce_t *nce)
3401 {
3402 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3403 
3404 	if (!nce->nce_trace_disable)
3405 		th_trace_unref(nce);
3406 }
3407 
3408 static void
3409 nce_trace_cleanup(const nce_t *nce)
3410 {
3411 	th_trace_cleanup(nce, nce->nce_trace_disable);
3412 }
3413 #endif
3414 
3415 /*
3416  * Called when address resolution fails due to a timeout.
3417  * Send an ICMP unreachable in response to all queued packets.
3418  */
3419 void
3420 arp_resolv_failed(nce_t *nce)
3421 {
3422 	mblk_t	*mp, *nxt_mp, *first_mp;
3423 	char	buf[INET6_ADDRSTRLEN];
3424 	zoneid_t zoneid = GLOBAL_ZONEID;
3425 	struct in_addr ipv4addr;
3426 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3427 
3428 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3429 	ip3dbg(("arp_resolv_failed: dst %s\n",
3430 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3431 	mutex_enter(&nce->nce_lock);
3432 	mp = nce->nce_qd_mp;
3433 	nce->nce_qd_mp = NULL;
3434 	mutex_exit(&nce->nce_lock);
3435 
3436 	while (mp != NULL) {
3437 		nxt_mp = mp->b_next;
3438 		mp->b_next = NULL;
3439 		mp->b_prev = NULL;
3440 
3441 		first_mp = mp;
3442 		/*
3443 		 * Send icmp unreachable messages
3444 		 * to the hosts.
3445 		 */
3446 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3447 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3448 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3449 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3450 		mp = nxt_mp;
3451 	}
3452 }
3453 
3454 int
3455 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3456     nce_t **newnce, nce_t *src_nce)
3457 {
3458 	int	err;
3459 	nce_t	*nce;
3460 	in6_addr_t addr6;
3461 	ip_stack_t *ipst = ill->ill_ipst;
3462 
3463 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3464 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3465 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3466 	nce = nce_lookup_addr(ill, &addr6, nce);
3467 	if (nce == NULL) {
3468 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3469 	} else {
3470 		*newnce = nce;
3471 		err = EEXIST;
3472 	}
3473 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3474 	return (err);
3475 }
3476 
3477 /*
3478  * NDP Cache Entry creation routine for IPv4.
3479  * Mapped entries are handled in arp.
3480  * This routine must always be called with ndp4->ndp_g_lock held.
3481  * Prior to return, nce_refcnt is incremented.
3482  */
3483 static int
3484 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3485     nce_t **newnce, nce_t *src_nce)
3486 {
3487 	static	nce_t		nce_nil;
3488 	nce_t		*nce;
3489 	mblk_t		*mp;
3490 	mblk_t		*template = NULL;
3491 	nce_t		**ncep;
3492 	ip_stack_t	*ipst = ill->ill_ipst;
3493 	uint16_t	state = ND_INITIAL;
3494 	int		err;
3495 
3496 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3497 	ASSERT(!ill->ill_isv6);
3498 	ASSERT((flags & NCE_F_MAPPING) == 0);
3499 
3500 	if (ill->ill_resolver_mp == NULL)
3501 		return (EINVAL);
3502 	/*
3503 	 * Allocate the mblk to hold the nce.
3504 	 */
3505 	mp = allocb(sizeof (nce_t), BPRI_MED);
3506 	if (mp == NULL)
3507 		return (ENOMEM);
3508 
3509 	nce = (nce_t *)mp->b_rptr;
3510 	mp->b_wptr = (uchar_t *)&nce[1];
3511 	*nce = nce_nil;
3512 	nce->nce_ill = ill;
3513 	nce->nce_ipversion = IPV4_VERSION;
3514 	nce->nce_flags = flags;
3515 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3516 	nce->nce_rcnt = ill->ill_xmit_count;
3517 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3518 	nce->nce_mask = ipv6_all_ones;
3519 	nce->nce_extract_mask = ipv6_all_zeros;
3520 	nce->nce_ll_extract_start = 0;
3521 	nce->nce_qd_mp = NULL;
3522 	nce->nce_mp = mp;
3523 	/* This one is for nce getting created */
3524 	nce->nce_refcnt = 1;
3525 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3526 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3527 
3528 	nce->nce_trace_disable = B_FALSE;
3529 
3530 	if (src_nce != NULL) {
3531 		/*
3532 		 * src_nce has been provided by the caller. The only
3533 		 * caller who provides a non-null, non-broadcast
3534 		 * src_nce is from ip_newroute() which must pass in
3535 		 * a ND_REACHABLE src_nce (this condition is verified
3536 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3537 		 */
3538 		mutex_enter(&src_nce->nce_lock);
3539 		state = src_nce->nce_state;
3540 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3541 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3542 			/*
3543 			 * src_nce has been deleted, or
3544 			 * ip_arp_news is in the middle of
3545 			 * flushing entries in the the nce.
3546 			 * Fail the add, since we don't know
3547 			 * if it is safe to copy the contents of
3548 			 * src_nce
3549 			 */
3550 			DTRACE_PROBE2(nce__bad__src__nce,
3551 			    nce_t *, src_nce, ill_t *, ill);
3552 			mutex_exit(&src_nce->nce_lock);
3553 			err = EINVAL;
3554 			goto err_ret;
3555 		}
3556 		template = copyb(src_nce->nce_res_mp);
3557 		mutex_exit(&src_nce->nce_lock);
3558 		if (template == NULL) {
3559 			err = ENOMEM;
3560 			goto err_ret;
3561 		}
3562 	} else if (flags & NCE_F_BCAST) {
3563 		/*
3564 		 * broadcast nce.
3565 		 */
3566 		template = copyb(ill->ill_bcast_mp);
3567 		if (template == NULL) {
3568 			err = ENOMEM;
3569 			goto err_ret;
3570 		}
3571 		state = ND_REACHABLE;
3572 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3573 		/*
3574 		 * NORESOLVER entries are always created in the REACHABLE
3575 		 * state. We create a nce_res_mp with the IP nexthop address
3576 		 * in the destination address in the DLPI hdr if the
3577 		 * physical length is exactly 4 bytes.
3578 		 *
3579 		 * XXX not clear which drivers set ill_phys_addr_length to
3580 		 * IP_ADDR_LEN.
3581 		 */
3582 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3583 			template = ill_dlur_gen((uchar_t *)addr,
3584 			    ill->ill_phys_addr_length,
3585 			    ill->ill_sap, ill->ill_sap_length);
3586 		} else {
3587 			template = copyb(ill->ill_resolver_mp);
3588 		}
3589 		if (template == NULL) {
3590 			err = ENOMEM;
3591 			goto err_ret;
3592 		}
3593 		state = ND_REACHABLE;
3594 	}
3595 	nce->nce_fp_mp = NULL;
3596 	nce->nce_res_mp = template;
3597 	nce->nce_state = state;
3598 	if (state == ND_REACHABLE) {
3599 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3600 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3601 	} else {
3602 		nce->nce_last = 0;
3603 		if (state == ND_INITIAL)
3604 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3605 	}
3606 
3607 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3608 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3609 	/*
3610 	 * Atomically ensure that the ill is not CONDEMNED, before
3611 	 * adding the NCE.
3612 	 */
3613 	mutex_enter(&ill->ill_lock);
3614 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3615 		mutex_exit(&ill->ill_lock);
3616 		err = EINVAL;
3617 		goto err_ret;
3618 	}
3619 	if ((nce->nce_next = *ncep) != NULL)
3620 		nce->nce_next->nce_ptpn = &nce->nce_next;
3621 	*ncep = nce;
3622 	nce->nce_ptpn = ncep;
3623 	*newnce = nce;
3624 	/* This one is for nce being used by an active thread */
3625 	NCE_REFHOLD(*newnce);
3626 
3627 	/* Bump up the number of nce's referencing this ill */
3628 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
3629 	    (char *), "nce", (void *), nce);
3630 	ill->ill_nce_cnt++;
3631 	mutex_exit(&ill->ill_lock);
3632 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3633 	return (0);
3634 err_ret:
3635 	freeb(mp);
3636 	freemsg(template);
3637 	return (err);
3638 }
3639 
3640 /*
3641  * ndp_walk routine to delete all entries that have a given destination or
3642  * gateway address and cached link layer (MAC) address.  This is used when ARP
3643  * informs us that a network-to-link-layer mapping may have changed.
3644  */
3645 void
3646 nce_delete_hw_changed(nce_t *nce, void *arg)
3647 {
3648 	nce_hw_map_t *hwm = arg;
3649 	mblk_t *mp;
3650 	dl_unitdata_req_t *dlu;
3651 	uchar_t *macaddr;
3652 	ill_t *ill;
3653 	int saplen;
3654 	ipaddr_t nce_addr;
3655 
3656 	if (nce->nce_state != ND_REACHABLE)
3657 		return;
3658 
3659 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3660 	if (nce_addr != hwm->hwm_addr)
3661 		return;
3662 
3663 	mutex_enter(&nce->nce_lock);
3664 	if ((mp = nce->nce_res_mp) == NULL) {
3665 		mutex_exit(&nce->nce_lock);
3666 		return;
3667 	}
3668 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3669 	macaddr = (uchar_t *)(dlu + 1);
3670 	ill = nce->nce_ill;
3671 	if ((saplen = ill->ill_sap_length) > 0)
3672 		macaddr += saplen;
3673 	else
3674 		saplen = -saplen;
3675 
3676 	/*
3677 	 * If the hardware address is unchanged, then leave this one alone.
3678 	 * Note that saplen == abs(saplen) now.
3679 	 */
3680 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3681 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3682 		mutex_exit(&nce->nce_lock);
3683 		return;
3684 	}
3685 	mutex_exit(&nce->nce_lock);
3686 
3687 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3688 	ndp_delete(nce);
3689 }
3690 
3691 /*
3692  * This function verifies whether a given IPv4 address is potentially known to
3693  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3694  * so that it can continue to look for hardware changes on that address.
3695  */
3696 boolean_t
3697 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3698 {
3699 	nce_t		*nce;
3700 	struct in_addr	nceaddr;
3701 	ip_stack_t	*ipst = ns->netstack_ip;
3702 
3703 	if (addr == INADDR_ANY)
3704 		return (B_FALSE);
3705 
3706 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3707 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3708 	for (; nce != NULL; nce = nce->nce_next) {
3709 		/* Note that only v4 mapped entries are in the table. */
3710 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3711 		if (addr == nceaddr.s_addr &&
3712 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3713 			/* Single flag check; no lock needed */
3714 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3715 				break;
3716 		}
3717 	}
3718 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3719 	return (nce != NULL);
3720 }
3721