xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision 34e48580)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Data-Link Driver
31  */
32 
33 #include	<sys/stropts.h>
34 #include	<sys/strsun.h>
35 #include	<sys/strsubr.h>
36 #include	<sys/atomic.h>
37 #include	<sys/mkdev.h>
38 #include	<sys/vlan.h>
39 #include	<sys/dld.h>
40 #include	<sys/dld_impl.h>
41 #include	<sys/dls_impl.h>
42 #include	<inet/common.h>
43 
/*
 * Forward declarations: the kmem cache constructor/destructor registered in
 * dld_str_init(), the DL_UNITDATA_IND builder, and the notification helpers.
 * str_notify() is the callback registered with mac_notify_add() in
 * dld_str_attach().
 */
44 static int	str_constructor(void *, void *, int);
45 static void	str_destructor(void *, void *);
46 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *);
47 static void	str_notify_promisc_on_phys(dld_str_t *);
48 static void	str_notify_promisc_off_phys(dld_str_t *);
49 static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
50 static void	str_notify_link_up(dld_str_t *);
51 static void	str_notify_link_down(dld_str_t *);
52 static void	str_notify_capab_reneg(dld_str_t *);
53 static void	str_notify_speed(dld_str_t *, uint32_t);
54 static void	str_notify(void *, mac_notify_type_t);
55 
/*
 * dld_ioc() is what dld_wput() calls for M_IOCTL messages; the other ioc*
 * routines are presumably its helpers (their definitions are not in this
 * part of the file -- confirm).  dld_minor_hold()/dld_minor_rele() are used
 * by the cache constructor/destructor to obtain and release a minor number.
 */
56 static void	ioc_raw(dld_str_t *, mblk_t *);
57 static void	ioc_fast(dld_str_t *,  mblk_t *);
58 static void	ioc(dld_str_t *, mblk_t *);
59 static void	dld_ioc(dld_str_t *, mblk_t *);
60 static minor_t	dld_minor_hold(boolean_t);
61 static void	dld_minor_rele(minor_t);
62 
/*
 * str_count:    incremented in dld_str_create(), decremented in
 *               dld_str_destroy(); dld_str_fini() refuses to tear down
 *               while it is non-zero.
 * str_cachep:   kmem cache of dld_str_t objects (see dld_str_init()).
 * minor_arenap: vmem arena created in dld_str_init() for minor numbers.
 * minor_count:  checked by dld_str_fini(); presumably maintained by
 *               dld_minor_hold()/dld_minor_rele() -- confirm.
 */
63 static uint32_t		str_count;
64 static kmem_cache_t	*str_cachep;
65 static vmem_t		*minor_arenap;
66 static uint32_t		minor_count;
67 
/*
 * Conversions between a minor number and the pointer-sized value passed to
 * the vmem arena; both go through uintptr_t.
 */
68 #define	MINOR_TO_PTR(minor)	((void *)(uintptr_t)(minor))
69 #define	PTR_TO_MINOR(ptr)	((minor_t)(uintptr_t)(ptr))
70 
71 /*
72  * Some notes on entry points, flow-control, queueing and locking:
73  *
74  * This driver exports the traditional STREAMS put entry point as well as
75  * the non-STREAMS fast-path transmit routine which is provided to IP via
76  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
77  * and data operations, while the fast-path routine deals only with M_DATA
78  * fast-path packets.  Regardless of the entry point, all outbound packets
79  * will end up in str_mdata_fastpath_put(), where they will be delivered to
80  * the MAC driver.
81  *
82  * The transmit logic operates in two modes: a "not busy" mode where the
83  * packets will be delivered to the MAC for a send attempt, or "busy" mode
84  * where they will be enqueued in the internal queue because of flow-control.
85  * Flow-control happens when the MAC driver indicates the packets couldn't
86  * be transmitted due to lack of resources (e.g. running out of descriptors).
87  * In such case, the driver will place a dummy message on its write-side
88  * STREAMS queue so that the queue is marked as "full".  Any subsequent
89  * packets arriving at the driver will be enqueued in the internal queue,
90  * which is drained in the context of the service thread that gets scheduled
91  * whenever the driver is in the "busy" mode.  When all packets have been
92  * successfully delivered by MAC and the internal queue is empty, it will
93  * transition to the "not busy" mode by removing the dummy message from the
94  * write-side STREAMS queue; in effect this will trigger backenabling.
95  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
96  * to the above reasons.
97  *
98  * The driver implements an internal transmit queue independent of STREAMS.
99  * This allows for flexibility and provides a fast enqueue/dequeue mechanism
100  * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
101  * getq() operations done by the driver are those related to placing and
102  * removing the dummy message to/from the write-side STREAMS queue for flow-
103  * control purposes.
104  *
105  * Locking is done independent of STREAMS due to the driver being fully MT.
106  * Threads entering the driver (either from put or service entry points)
107  * will most likely be readers, with the exception of a few writer cases
108  * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
109  * DLD-related ioctl requests.  The DLPI detach case is special, because
110  * it involves freeing resources and therefore must be single-threaded.
111  * Unfortunately the readers/writers lock can't be used to protect against
112  * it, because the lock is dropped prior to the driver calling places where
113  * putnext() may be invoked, and such places may depend on those resources
114  * to exist.  Because of this, the driver always completes the DLPI detach
115  * process when there are no other threads running in the driver.  This is
116  * done by keeping track of the number of threads, such that the last
117  * thread leaving the driver will finish the pending DLPI detach operation.
118  */
119 
120 /*
121  * dld_max_q_count is the queue depth threshold used to limit the number of
122  * outstanding packets or bytes allowed in the queue; once this limit is
123  * reached the driver will free any incoming ones until the queue depth
124  * drops below the threshold.
125  *
126  * This buffering is provided to accommodate clients which do not employ
127  * their own buffering scheme, and to handle occasional packet bursts.
128  * Clients which handle their own buffering will receive positive feedback
129  * from this driver as soon as it transitions into the "busy" state, i.e.
130  * when the queue is initially filled up; they will get backenabled once
131  * the queue is empty.
132  *
133  * The value chosen here is rather arbitrary; in future some intelligent
134  * heuristics may be involved which could take into account the hardware's
135  * transmit ring size, etc.
136  */
/*
 * Default threshold is 16 * 1024 * 1024.  NOTE(review): the code that
 * compares the queue depth against this value is not in this part of the
 * file.
 */
137 uint_t dld_max_q_count = (16 * 1024 *1024);
138 
139 static dev_info_t *
140 dld_finddevinfo(dev_t dev)
141 {
142 	minor_t		minor = getminor(dev);
143 	char		*drvname = ddi_major_to_name(getmajor(dev));
144 	char		name[MAXNAMELEN];
145 	dls_vlan_t	*dvp = NULL;
146 	dev_info_t	*dip = NULL;
147 
148 	if (drvname == NULL || minor == 0 || minor > DLD_MAX_PPA + 1)
149 		return (NULL);
150 
151 	(void) snprintf(name, MAXNAMELEN, "%s%d", drvname, (int)minor - 1);
152 	if (dls_vlan_hold(name, &dvp, B_FALSE) != 0)
153 		return (NULL);
154 
155 	dip = mac_devinfo_get(dvp->dv_dlp->dl_mh);
156 	dls_vlan_rele(dvp);
157 	return (dip);
158 }
159 
160 /*
161  * devo_getinfo: getinfo(9e)
162  */
163 /*ARGSUSED*/
164 int
165 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
166 {
167 	dev_info_t	*devinfo;
168 	minor_t		minor = getminor((dev_t)arg);
169 	int		rc = DDI_FAILURE;
170 
171 	switch (cmd) {
172 	case DDI_INFO_DEVT2DEVINFO:
173 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
174 			*(dev_info_t **)resp = devinfo;
175 			rc = DDI_SUCCESS;
176 		}
177 		break;
178 	case DDI_INFO_DEVT2INSTANCE:
179 		if (minor > 0 && minor <= DLD_MAX_PPA + 1) {
180 			*(int *)resp = (int)minor - 1;
181 			rc = DDI_SUCCESS;
182 		}
183 		break;
184 	}
185 	return (rc);
186 }
187 
188 /*
189  * qi_qopen: open(9e)
190  */
191 /*ARGSUSED*/
192 int
193 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
194 {
195 	dld_str_t	*dsp;
196 	major_t		major;
197 	minor_t		minor;
198 	int		err;
199 
200 	if (sflag == MODOPEN)
201 		return (ENOTSUP);
202 
203 	/*
204 	 * This is a cloning driver and therefore each queue should only
205 	 * ever get opened once.
206 	 */
207 	if (rq->q_ptr != NULL)
208 		return (EBUSY);
209 
210 	major = getmajor(*devp);
211 	minor = getminor(*devp);
212 	if (minor > DLD_MAX_MINOR)
213 		return (ENODEV);
214 
215 	/*
216 	 * Create a new dld_str_t for the stream. This will grab a new minor
217 	 * number that will be handed back in the cloned dev_t.  Creation may
218 	 * fail if we can't allocate the dummy mblk used for flow-control.
219 	 */
220 	dsp = dld_str_create(rq, DLD_DLPI, major,
221 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
222 	if (dsp == NULL)
223 		return (ENOSR);
224 
225 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
226 	if (minor != 0) {
227 		/*
228 		 * Style 1 open
229 		 */
230 
231 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
232 			goto failed;
233 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
234 	}
235 
236 	/*
237 	 * Enable the queue srv(9e) routine.
238 	 */
239 	qprocson(rq);
240 
241 	/*
242 	 * Construct a cloned dev_t to hand back.
243 	 */
244 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
245 	return (0);
246 
247 failed:
248 	dld_str_destroy(dsp);
249 	return (err);
250 }
251 
252 /*
253  * qi_qclose: close(9e)
254  */
255 int
256 dld_close(queue_t *rq)
257 {
258 	dld_str_t	*dsp = rq->q_ptr;
259 
260 	/*
261 	 * Disable the queue srv(9e) routine.
262 	 */
263 	qprocsoff(rq);
264 
265 	/*
266 	 * At this point we can not be entered by any threads via STREAMS
267 	 * or the direct call interface, which is available only to IP.
268 	 * After the interface is unplumbed, IP wouldn't have any reference
269 	 * to this instance, and therefore we are now effectively single
270 	 * threaded and don't require any lock protection.  Flush all
271 	 * pending packets which are sitting in the transmit queue.
272 	 */
273 	ASSERT(dsp->ds_thr == 0);
274 	dld_tx_flush(dsp);
275 
276 	/*
277 	 * This stream was open to a provider node. Check to see
278 	 * if it has been cleanly shut down.
279 	 */
280 	if (dsp->ds_dlstate != DL_UNATTACHED) {
281 		/*
282 		 * The stream is either open to a style 1 provider or
283 		 * this is not clean shutdown. Detach from the PPA.
284 		 * (This is still ok even in the style 1 case).
285 		 */
286 		dld_str_detach(dsp);
287 	}
288 
289 	dld_str_destroy(dsp);
290 	return (0);
291 }
292 
293 /*
294  * qi_qputp: put(9e)
295  */
296 void
297 dld_wput(queue_t *wq, mblk_t *mp)
298 {
299 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
300 
301 	DLD_ENTER(dsp);
302 
303 	switch (DB_TYPE(mp)) {
304 	case M_DATA:
305 		rw_enter(&dsp->ds_lock, RW_READER);
306 		if (dsp->ds_dlstate != DL_IDLE ||
307 		    dsp->ds_mode == DLD_UNITDATA) {
308 			freemsg(mp);
309 		} else if (dsp->ds_mode == DLD_FASTPATH) {
310 			str_mdata_fastpath_put(dsp, mp);
311 		} else if (dsp->ds_mode == DLD_RAW) {
312 			str_mdata_raw_put(dsp, mp);
313 		}
314 		rw_exit(&dsp->ds_lock);
315 		break;
316 	case M_PROTO:
317 	case M_PCPROTO:
318 		dld_proto(dsp, mp);
319 		break;
320 	case M_IOCTL:
321 		dld_ioc(dsp, mp);
322 		break;
323 	case M_FLUSH:
324 		if (*mp->b_rptr & FLUSHW) {
325 			dld_tx_flush(dsp);
326 			*mp->b_rptr &= ~FLUSHW;
327 		}
328 
329 		if (*mp->b_rptr & FLUSHR) {
330 			qreply(wq, mp);
331 		} else {
332 			freemsg(mp);
333 		}
334 		break;
335 	default:
336 		freemsg(mp);
337 		break;
338 	}
339 
340 	DLD_EXIT(dsp);
341 }
342 
343 /*
344  * qi_srvp: srv(9e)
345  */
346 void
347 dld_wsrv(queue_t *wq)
348 {
349 	mblk_t		*mp;
350 	dld_str_t	*dsp = wq->q_ptr;
351 
352 	DLD_ENTER(dsp);
353 	rw_enter(&dsp->ds_lock, RW_READER);
354 	/*
355 	 * Grab all packets (chained via b_next) off our transmit queue
356 	 * and try to send them all to the MAC layer.  Since the queue
357 	 * is independent of streams, we are able to dequeue all messages
358 	 * at once without looping through getq() and manually chaining
359 	 * them.  Note that the queue size parameters (byte and message
360 	 * counts) are cleared as well, but we postpone the backenabling
361 	 * until after the MAC transmit since some packets may end up
362 	 * back at our transmit queue.
363 	 */
364 	mutex_enter(&dsp->ds_tx_list_lock);
365 	if ((mp = dsp->ds_tx_list_head) == NULL) {
366 		ASSERT(!dsp->ds_tx_qbusy);
367 		ASSERT(dsp->ds_tx_flow_mp != NULL);
368 		ASSERT(dsp->ds_tx_list_head == NULL);
369 		ASSERT(dsp->ds_tx_list_tail == NULL);
370 		ASSERT(dsp->ds_tx_cnt == 0);
371 		ASSERT(dsp->ds_tx_msgcnt == 0);
372 		mutex_exit(&dsp->ds_tx_list_lock);
373 		goto done;
374 	}
375 	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
376 	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
377 	mutex_exit(&dsp->ds_tx_list_lock);
378 
379 	/*
380 	 * Discard packets unless we are attached and bound; note that
381 	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
382 	 * because regardless of the mode all transmit will end up in
383 	 * str_mdata_fastpath_put() where the packets may be queued.
384 	 */
385 	ASSERT(DB_TYPE(mp) == M_DATA);
386 	if (dsp->ds_dlstate != DL_IDLE) {
387 		freemsgchain(mp);
388 		goto done;
389 	}
390 
391 	/*
392 	 * Attempt to transmit one or more packets.  If the MAC can't
393 	 * send them all, re-queue the packet(s) at the beginning of
394 	 * the transmit queue to avoid any re-ordering.
395 	 */
396 	if ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)
397 		dld_tx_enqueue(dsp, mp, B_TRUE);
398 
399 	/*
400 	 * Grab the list lock again and check if the transmit queue is
401 	 * really empty; if so, lift up flow-control and backenable any
402 	 * writer queues.  If the queue is not empty, schedule service
403 	 * thread to drain it.
404 	 */
405 	mutex_enter(&dsp->ds_tx_list_lock);
406 	if (dsp->ds_tx_list_head == NULL) {
407 		dsp->ds_tx_flow_mp = getq(wq);
408 		ASSERT(dsp->ds_tx_flow_mp != NULL);
409 		dsp->ds_tx_qbusy = B_FALSE;
410 	}
411 	mutex_exit(&dsp->ds_tx_list_lock);
412 done:
413 	rw_exit(&dsp->ds_lock);
414 	DLD_EXIT(dsp);
415 }
416 
417 void
418 dld_init_ops(struct dev_ops *ops, const char *name)
419 {
420 	struct streamtab *stream;
421 	struct qinit *rq, *wq;
422 	struct module_info *modinfo;
423 
424 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
425 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
426 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
427 	modinfo->mi_minpsz = 0;
428 	modinfo->mi_maxpsz = 64*1024;
429 	modinfo->mi_hiwat  = 1;
430 	modinfo->mi_lowat = 0;
431 
432 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
433 	rq->qi_qopen = dld_open;
434 	rq->qi_qclose = dld_close;
435 	rq->qi_minfo = modinfo;
436 
437 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
438 	wq->qi_putp = (pfi_t)dld_wput;
439 	wq->qi_srvp = (pfi_t)dld_wsrv;
440 	wq->qi_minfo = modinfo;
441 
442 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
443 	stream->st_rdinit = rq;
444 	stream->st_wrinit = wq;
445 	ops->devo_cb_ops->cb_str = stream;
446 
447 	ops->devo_getinfo = &dld_getinfo;
448 }
449 
450 void
451 dld_fini_ops(struct dev_ops *ops)
452 {
453 	struct streamtab *stream;
454 	struct qinit *rq, *wq;
455 	struct module_info *modinfo;
456 
457 	stream = ops->devo_cb_ops->cb_str;
458 	rq = stream->st_rdinit;
459 	wq = stream->st_wrinit;
460 	modinfo = rq->qi_minfo;
461 	ASSERT(wq->qi_minfo == modinfo);
462 
463 	kmem_free(stream, sizeof (struct streamtab));
464 	kmem_free(wq, sizeof (struct qinit));
465 	kmem_free(rq, sizeof (struct qinit));
466 	kmem_free(modinfo->mi_idname, FMNAMESZ);
467 	kmem_free(modinfo, sizeof (struct module_info));
468 }
469 
470 /*
471  * Initialize this module's data structures.
472  */
473 void
474 dld_str_init(void)
475 {
476 	/*
477 	 * Create dld_str_t object cache.
478 	 */
479 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
480 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
481 	ASSERT(str_cachep != NULL);
482 
483 	/*
484 	 * Allocate a vmem arena to manage minor numbers. The range of the
485 	 * arena will be from DLD_MAX_MINOR + 1 to MAXMIN (maximum legal
486 	 * minor number).
487 	 */
488 	minor_arenap = vmem_create("dld_minor_arena",
489 	    MINOR_TO_PTR(DLD_MAX_MINOR + 1), MAXMIN, 1, NULL, NULL, NULL, 0,
490 	    VM_SLEEP | VMC_IDENTIFIER);
491 	ASSERT(minor_arenap != NULL);
492 }
493 
494 /*
495  * Tear down this module's data structures.
496  */
497 int
498 dld_str_fini(void)
499 {
500 	/*
501 	 * Make sure that there are no objects in use.
502 	 */
503 	if (str_count != 0)
504 		return (EBUSY);
505 
506 	/*
507 	 * Check to see if there are any minor numbers still in use.
508 	 */
509 	if (minor_count != 0)
510 		return (EBUSY);
511 
512 	/*
513 	 * Destroy object cache.
514 	 */
515 	kmem_cache_destroy(str_cachep);
516 	vmem_destroy(minor_arenap);
517 	return (0);
518 }
519 
520 /*
521  * Create a new dld_str_t object.
522  */
523 dld_str_t *
524 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
525 {
526 	dld_str_t	*dsp;
527 
528 	/*
529 	 * Allocate an object from the cache.
530 	 */
531 	atomic_add_32(&str_count, 1);
532 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
533 
534 	/*
535 	 * Allocate the dummy mblk for flow-control.
536 	 */
537 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
538 	if (dsp->ds_tx_flow_mp == NULL) {
539 		kmem_cache_free(str_cachep, dsp);
540 		atomic_add_32(&str_count, -1);
541 		return (NULL);
542 	}
543 	dsp->ds_type = type;
544 	dsp->ds_major = major;
545 	dsp->ds_style = style;
546 
547 	/*
548 	 * Initialize the queue pointers.
549 	 */
550 	ASSERT(RD(rq) == rq);
551 	dsp->ds_rq = rq;
552 	dsp->ds_wq = WR(rq);
553 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
554 
555 	/*
556 	 * We want explicit control over our write-side STREAMS queue
557 	 * where the dummy mblk gets added/removed for flow-control.
558 	 */
559 	noenable(WR(rq));
560 
561 	return (dsp);
562 }
563 
564 /*
565  * Destroy a dld_str_t object.
566  */
567 void
568 dld_str_destroy(dld_str_t *dsp)
569 {
570 	queue_t		*rq;
571 	queue_t		*wq;
572 
573 	/*
574 	 * Clear the queue pointers.
575 	 */
576 	rq = dsp->ds_rq;
577 	wq = dsp->ds_wq;
578 	ASSERT(wq == WR(rq));
579 
580 	rq->q_ptr = wq->q_ptr = NULL;
581 	dsp->ds_rq = dsp->ds_wq = NULL;
582 
583 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
584 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
585 	ASSERT(dsp->ds_tx_list_head == NULL);
586 	ASSERT(dsp->ds_tx_list_tail == NULL);
587 	ASSERT(dsp->ds_tx_cnt == 0);
588 	ASSERT(dsp->ds_tx_msgcnt == 0);
589 	ASSERT(!dsp->ds_tx_qbusy);
590 
591 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
592 	ASSERT(dsp->ds_thr == 0);
593 	ASSERT(dsp->ds_detach_req == NULL);
594 
595 	/*
596 	 * Reinitialize all the flags.
597 	 */
598 	dsp->ds_notifications = 0;
599 	dsp->ds_passivestate = DLD_UNINITIALIZED;
600 	dsp->ds_mode = DLD_UNITDATA;
601 
602 	/*
603 	 * Free the dummy mblk if exists.
604 	 */
605 	if (dsp->ds_tx_flow_mp != NULL) {
606 		freeb(dsp->ds_tx_flow_mp);
607 		dsp->ds_tx_flow_mp = NULL;
608 	}
609 	/*
610 	 * Free the object back to the cache.
611 	 */
612 	kmem_cache_free(str_cachep, dsp);
613 	atomic_add_32(&str_count, -1);
614 }
615 
616 /*
617  * kmem_cache contructor function: see kmem_cache_create(9f).
618  */
619 /*ARGSUSED*/
620 static int
621 str_constructor(void *buf, void *cdrarg, int kmflags)
622 {
623 	dld_str_t	*dsp = buf;
624 
625 	bzero(buf, sizeof (dld_str_t));
626 
627 	/*
628 	 * Allocate a new minor number.
629 	 */
630 	if ((dsp->ds_minor = dld_minor_hold(kmflags == KM_SLEEP)) == 0)
631 		return (-1);
632 
633 	/*
634 	 * Initialize the DLPI state machine.
635 	 */
636 	dsp->ds_dlstate = DL_UNATTACHED;
637 
638 	mutex_init(&dsp->ds_thr_lock, NULL, MUTEX_DRIVER, NULL);
639 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
640 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
641 
642 	return (0);
643 }
644 
645 /*
646  * kmem_cache destructor function.
647  */
648 /*ARGSUSED*/
649 static void
650 str_destructor(void *buf, void *cdrarg)
651 {
652 	dld_str_t	*dsp = buf;
653 
654 	/*
655 	 * Make sure the DLPI state machine was reset.
656 	 */
657 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
658 
659 	/*
660 	 * Make sure the data-link interface was closed.
661 	 */
662 	ASSERT(dsp->ds_mh == NULL);
663 	ASSERT(dsp->ds_dc == NULL);
664 
665 	/*
666 	 * Make sure enabled notifications are cleared.
667 	 */
668 	ASSERT(dsp->ds_notifications == 0);
669 
670 	/*
671 	 * Make sure polling is disabled.
672 	 */
673 	ASSERT(!dsp->ds_polling);
674 
675 	/*
676 	 * Release the minor number.
677 	 */
678 	dld_minor_rele(dsp->ds_minor);
679 
680 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
681 	rw_destroy(&dsp->ds_lock);
682 
683 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
684 	mutex_destroy(&dsp->ds_tx_list_lock);
685 	ASSERT(dsp->ds_tx_flow_mp == NULL);
686 
687 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
688 	mutex_destroy(&dsp->ds_thr_lock);
689 	ASSERT(dsp->ds_detach_req == NULL);
690 }
691 
692 /*
693  * M_DATA put (IP fast-path mode)
694  */
695 void
696 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
697 {
698 	/*
699 	 * We get here either as a result of putnext() from above or
700 	 * because IP has called us directly.  If we are in the busy
701 	 * mode enqueue the packet(s) and return.  Otherwise hand them
702 	 * over to the MAC driver for transmission; any remaining one(s)
703 	 * which didn't get sent will be queued.
704 	 *
705 	 * Note here that we don't grab the list lock prior to checking
706 	 * the busy flag.  This is okay, because a missed transition
707 	 * will not cause any packet reordering for any particular TCP
708 	 * connection (which is single-threaded).  The enqueue routine
709 	 * will atomically set the busy flag and schedule the service
710 	 * thread to run; the flag is only cleared by the service thread
711 	 * when there is no more packet to be transmitted.
712 	 */
713 	if (dsp->ds_tx_qbusy || (mp = dls_tx(dsp->ds_dc, mp)) != NULL)
714 		dld_tx_enqueue(dsp, mp, B_FALSE);
715 }
716 
717 /*
718  * M_DATA put (raw mode)
719  */
720 void
721 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
722 {
723 	struct ether_header	*ehp;
724 	mblk_t			*bp;
725 	size_t			size;
726 	size_t			hdrlen;
727 
728 	size = MBLKL(mp);
729 	if (size < sizeof (struct ether_header))
730 		goto discard;
731 
732 	hdrlen = sizeof (struct ether_header);
733 
734 	ehp = (struct ether_header *)mp->b_rptr;
735 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
736 		struct ether_vlan_header	*evhp;
737 
738 		if (size < sizeof (struct ether_vlan_header))
739 			goto discard;
740 
741 		/*
742 		 * Replace vtag with our own
743 		 */
744 		evhp = (struct ether_vlan_header *)ehp;
745 		evhp->ether_tci = htons(VLAN_TCI(dsp->ds_pri,
746 		    ETHER_CFI, dsp->ds_vid));
747 		hdrlen = sizeof (struct ether_vlan_header);
748 	}
749 
750 	/*
751 	 * Check the packet is not too big and that any remaining
752 	 * fragment list is composed entirely of M_DATA messages. (We
753 	 * know the first fragment was M_DATA otherwise we could not
754 	 * have got here).
755 	 */
756 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
757 		if (DB_TYPE(bp) != M_DATA)
758 			goto discard;
759 		size += MBLKL(bp);
760 	}
761 
762 	if (size > dsp->ds_mip->mi_sdu_max + hdrlen)
763 		goto discard;
764 
765 	str_mdata_fastpath_put(dsp, mp);
766 	return;
767 
768 discard:
769 	freemsg(mp);
770 }
771 
772 /*
773  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
774  */
775 int
776 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
777 {
778 	int			err;
779 	const char		*drvname;
780 	char			name[MAXNAMELEN];
781 	dls_channel_t		dc;
782 	uint_t			addr_length;
783 
784 	ASSERT(dsp->ds_dc == NULL);
785 
786 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
787 		return (EINVAL);
788 
789 	(void) snprintf(name, MAXNAMELEN, "%s%u", drvname, ppa);
790 
791 	if (strcmp(drvname, "aggr") != 0 &&
792 	    qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
793 		return (EINVAL);
794 
795 	/*
796 	 * Open a channel.
797 	 */
798 	if ((err = dls_open(name, &dc)) != 0) {
799 		(void) qassociate(dsp->ds_wq, -1);
800 		return (err);
801 	}
802 
803 	/*
804 	 * Cache the MAC interface handle, a pointer to the immutable MAC
805 	 * information and the current and 'factory' MAC address.
806 	 */
807 	dsp->ds_mh = dls_mac(dc);
808 	dsp->ds_mip = mac_info(dsp->ds_mh);
809 
810 	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
811 
812 	addr_length = dsp->ds_mip->mi_addr_length;
813 	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
814 
815 	/*
816 	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
817 	 * a non-VLAN interface).
818 	 */
819 	dsp->ds_vid = dls_vid(dc);
820 
821 	/*
822 	 * Set the default packet priority.
823 	 */
824 	dsp->ds_pri = 0;
825 
826 	/*
827 	 * Add a notify function so that the we get updates from the MAC.
828 	 */
829 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
830 
831 	dsp->ds_dc = dc;
832 	dsp->ds_dlstate = DL_UNBOUND;
833 
834 	return (0);
835 }
836 
837 /*
838  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
839  * from close(2) for style 2.
840  */
841 void
842 dld_str_detach(dld_str_t *dsp)
843 {
844 	ASSERT(dsp->ds_thr == 0);
845 
846 	/*
847 	 * Remove the notify function.
848 	 */
849 	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);
850 
851 	/*
852 	 * Re-initialize the DLPI state machine.
853 	 */
854 	dsp->ds_dlstate = DL_UNATTACHED;
855 
856 	/*
857 	 * Clear the polling and promisc flags.
858 	 */
859 	dsp->ds_polling = B_FALSE;
860 	dsp->ds_promisc = 0;
861 
862 	/*
863 	 * Close the channel.
864 	 */
865 	dls_close(dsp->ds_dc);
866 	dsp->ds_dc = NULL;
867 	dsp->ds_mh = NULL;
868 
869 	(void) qassociate(dsp->ds_wq, -1);
870 }
871 
872 /*
873  * Raw mode receive function.
874  */
875 /*ARGSUSED*/
876 void
877 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
878     size_t header_length)
879 {
880 	dld_str_t		*dsp = (dld_str_t *)arg;
881 	mblk_t			*next;
882 
883 	ASSERT(mp != NULL);
884 	do {
885 		/*
886 		 * Get the pointer to the next packet in the chain and then
887 		 * clear b_next before the packet gets passed on.
888 		 */
889 		next = mp->b_next;
890 		mp->b_next = NULL;
891 
892 		/*
893 		 * Wind back b_rptr to point at the MAC header.
894 		 */
895 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
896 		mp->b_rptr -= header_length;
897 		if (header_length == sizeof (struct ether_vlan_header)) {
898 			/*
899 			 * Strip off the vtag
900 			 */
901 			ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ,
902 			    2 * ETHERADDRL);
903 			mp->b_rptr += VLAN_TAGSZ;
904 		}
905 
906 		/*
907 		 * Pass the packet on.
908 		 */
909 		putnext(dsp->ds_rq, mp);
910 
911 		/*
912 		 * Move on to the next packet in the chain.
913 		 */
914 		mp = next;
915 	} while (mp != NULL);
916 }
917 
918 /*
919  * Fast-path receive function.
920  */
921 /*ARGSUSED*/
922 void
923 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
924     size_t header_length)
925 {
926 	dld_str_t		*dsp = (dld_str_t *)arg;
927 	mblk_t			*next;
928 
929 	ASSERT(mp != NULL);
930 	do {
931 		/*
932 		 * Get the pointer to the next packet in the chain and then
933 		 * clear b_next before the packet gets passed on.
934 		 */
935 		next = mp->b_next;
936 		mp->b_next = NULL;
937 
938 		/*
939 		 * Pass the packet on.
940 		 */
941 		putnext(dsp->ds_rq, mp);
942 
943 		/*
944 		 * Move on to the next packet in the chain.
945 		 */
946 		mp = next;
947 	} while (mp != NULL);
948 }
949 
950 /*
951  * Default receive function (send DL_UNITDATA_IND messages).
952  */
953 /*ARGSUSED*/
954 void
955 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
956     size_t header_length)
957 {
958 	dld_str_t		*dsp = (dld_str_t *)arg;
959 	mblk_t			*ud_mp;
960 	mblk_t			*next;
961 
962 	ASSERT(mp != NULL);
963 	do {
964 		/*
965 		 * Get the pointer to the next packet in the chain and then
966 		 * clear b_next before the packet gets passed on.
967 		 */
968 		next = mp->b_next;
969 		mp->b_next = NULL;
970 
971 		/*
972 		 * Wind back b_rptr to point at the MAC header.
973 		 */
974 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
975 		mp->b_rptr -= header_length;
976 
977 		/*
978 		 * Create the DL_UNITDATA_IND M_PROTO.
979 		 */
980 		if ((ud_mp = str_unitdata_ind(dsp, mp)) == NULL) {
981 			freemsgchain(mp);
982 			return;
983 		}
984 
985 		/*
986 		 * Advance b_rptr to point at the payload again.
987 		 */
988 		mp->b_rptr += header_length;
989 
990 		/*
991 		 * Prepend the DL_UNITDATA_IND.
992 		 */
993 		ud_mp->b_cont = mp;
994 
995 		/*
996 		 * Send the message.
997 		 */
998 		putnext(dsp->ds_rq, ud_mp);
999 
1000 		/*
1001 		 * Move on to the next packet in the chain.
1002 		 */
1003 		mp = next;
1004 	} while (mp != NULL);
1005 }
1006 
1007 /*
1008  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1009  * current state of the interface.
1010  */
1011 void
1012 dld_str_notify_ind(dld_str_t *dsp)
1013 {
1014 	mac_notify_type_t	type;
1015 
1016 	for (type = 0; type < MAC_NNOTE; type++)
1017 		str_notify(dsp, type);
1018 }
1019 
/*
 * Layout of the DL_UNITDATA_IND message built by str_unitdata_ind(): the
 * DLPI primitive followed by room for the destination and the source
 * address.  Each address buffer holds up to MAXADDRLEN address bytes plus
 * a uint16_t; str_unitdata_ind() stores a 16-bit value (the bound SAP for
 * the destination, the packet's ethertype for the source) directly after
 * the address bytes.
 */
1020 typedef struct dl_unitdata_ind_wrapper {
1021 	dl_unitdata_ind_t	dl_unitdata;
1022 	uint8_t			dl_dest_addr[MAXADDRLEN + sizeof (uint16_t)];
1023 	uint8_t			dl_src_addr[MAXADDRLEN + sizeof (uint16_t)];
1024 } dl_unitdata_ind_wrapper_t;
1025 
1026 /*
1027  * Create a DL_UNITDATA_IND M_PROTO message.
1028  */
1029 static mblk_t *
1030 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp)
1031 {
1032 	mblk_t				*nmp;
1033 	dl_unitdata_ind_wrapper_t	*dlwp;
1034 	dl_unitdata_ind_t		*dlp;
1035 	dls_header_info_t		dhi;
1036 	uint_t				addr_length;
1037 	uint8_t				*daddr;
1038 	uint8_t				*saddr;
1039 
1040 	/*
1041 	 * Get the packet header information.
1042 	 */
1043 	dls_header_info(dsp->ds_dc, mp, &dhi);
1044 
1045 	/*
1046 	 * Allocate a message large enough to contain the wrapper structure
1047 	 * defined above.
1048 	 */
1049 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1050 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1051 	    DL_UNITDATA_IND)) == NULL)
1052 		return (NULL);
1053 
1054 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1055 
1056 	dlp = &(dlwp->dl_unitdata);
1057 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1058 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1059 
1060 	/*
1061 	 * Copy in the destination address.
1062 	 */
1063 	addr_length = dsp->ds_mip->mi_addr_length;
1064 	daddr = dlwp->dl_dest_addr;
1065 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1066 	bcopy(dhi.dhi_daddr, daddr, addr_length);
1067 
1068 	/*
1069 	 * Set the destination DLSAP to our bound DLSAP value.
1070 	 */
1071 	*(uint16_t *)(daddr + addr_length) = dsp->ds_sap;
1072 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1073 
1074 	/*
1075 	 * If the destination address was a group address then
1076 	 * dl_group_address field should be non-zero.
1077 	 */
1078 	dlp->dl_group_address = dhi.dhi_isgroup;
1079 
1080 	/*
1081 	 * Copy in the source address.
1082 	 */
1083 	saddr = dlwp->dl_src_addr;
1084 	dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1085 	bcopy(dhi.dhi_saddr, saddr, addr_length);
1086 
1087 	/*
1088 	 * Set the source DLSAP to the packet ethertype.
1089 	 */
1090 	*(uint16_t *)(saddr + addr_length) = dhi.dhi_ethertype;
1091 	dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1092 
1093 	return (nmp);
1094 }
1095 
1096 /*
1097  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1098  */
1099 static void
1100 str_notify_promisc_on_phys(dld_str_t *dsp)
1101 {
1102 	mblk_t		*mp;
1103 	dl_notify_ind_t	*dlip;
1104 
1105 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1106 		return;
1107 
1108 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1109 	    M_PROTO, 0)) == NULL)
1110 		return;
1111 
1112 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1113 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1114 	dlip->dl_primitive = DL_NOTIFY_IND;
1115 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1116 
1117 	qreply(dsp->ds_wq, mp);
1118 }
1119 
1120 /*
1121  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1122  */
1123 static void
1124 str_notify_promisc_off_phys(dld_str_t *dsp)
1125 {
1126 	mblk_t		*mp;
1127 	dl_notify_ind_t	*dlip;
1128 
1129 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1130 		return;
1131 
1132 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1133 	    M_PROTO, 0)) == NULL)
1134 		return;
1135 
1136 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1137 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1138 	dlip->dl_primitive = DL_NOTIFY_IND;
1139 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1140 
1141 	qreply(dsp->ds_wq, mp);
1142 }
1143 
1144 /*
1145  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1146  */
1147 static void
1148 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1149 {
1150 	mblk_t		*mp;
1151 	dl_notify_ind_t	*dlip;
1152 	uint_t		addr_length;
1153 	uint16_t	ethertype;
1154 
1155 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1156 		return;
1157 
1158 	addr_length = dsp->ds_mip->mi_addr_length;
1159 	if ((mp = mexchange(dsp->ds_wq, NULL,
1160 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1161 	    M_PROTO, 0)) == NULL)
1162 		return;
1163 
1164 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1165 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1166 	dlip->dl_primitive = DL_NOTIFY_IND;
1167 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1168 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1169 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1170 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1171 
1172 	bcopy(addr, &dlip[1], addr_length);
1173 
1174 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1175 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) =
1176 		ethertype;
1177 
1178 	qreply(dsp->ds_wq, mp);
1179 }
1180 
1181 /*
1182  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1183  */
1184 static void
1185 str_notify_link_up(dld_str_t *dsp)
1186 {
1187 	mblk_t		*mp;
1188 	dl_notify_ind_t	*dlip;
1189 
1190 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1191 		return;
1192 
1193 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1194 	    M_PROTO, 0)) == NULL)
1195 		return;
1196 
1197 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1198 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1199 	dlip->dl_primitive = DL_NOTIFY_IND;
1200 	dlip->dl_notification = DL_NOTE_LINK_UP;
1201 
1202 	qreply(dsp->ds_wq, mp);
1203 }
1204 
1205 /*
1206  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1207  */
1208 static void
1209 str_notify_link_down(dld_str_t *dsp)
1210 {
1211 	mblk_t		*mp;
1212 	dl_notify_ind_t	*dlip;
1213 
1214 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1215 		return;
1216 
1217 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1218 	    M_PROTO, 0)) == NULL)
1219 		return;
1220 
1221 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1222 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1223 	dlip->dl_primitive = DL_NOTIFY_IND;
1224 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1225 
1226 	qreply(dsp->ds_wq, mp);
1227 }
1228 
1229 /*
1230  * DL_NOTIFY_IND: DL_NOTE_SPEED
1231  */
1232 static void
1233 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1234 {
1235 	mblk_t		*mp;
1236 	dl_notify_ind_t	*dlip;
1237 
1238 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1239 		return;
1240 
1241 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1242 	    M_PROTO, 0)) == NULL)
1243 		return;
1244 
1245 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1246 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1247 	dlip->dl_primitive = DL_NOTIFY_IND;
1248 	dlip->dl_notification = DL_NOTE_SPEED;
1249 	dlip->dl_data = speed;
1250 
1251 	qreply(dsp->ds_wq, mp);
1252 }
1253 
1254 /*
1255  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1256  */
1257 static void
1258 str_notify_capab_reneg(dld_str_t *dsp)
1259 {
1260 	mblk_t		*mp;
1261 	dl_notify_ind_t	*dlip;
1262 
1263 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1264 		return;
1265 
1266 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1267 	    M_PROTO, 0)) == NULL)
1268 		return;
1269 
1270 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1271 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1272 	dlip->dl_primitive = DL_NOTIFY_IND;
1273 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1274 
1275 	qreply(dsp->ds_wq, mp);
1276 }
1277 
1278 /*
1279  * MAC notification callback.
1280  */
1281 static void
1282 str_notify(void *arg, mac_notify_type_t type)
1283 {
1284 	dld_str_t		*dsp = (dld_str_t *)arg;
1285 	queue_t			*q = dsp->ds_wq;
1286 
1287 	switch (type) {
1288 	case MAC_NOTE_TX:
1289 		qenable(q);
1290 		break;
1291 
1292 	case MAC_NOTE_DEVPROMISC:
1293 		/*
1294 		 * Send the appropriate DL_NOTIFY_IND.
1295 		 */
1296 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1297 			str_notify_promisc_on_phys(dsp);
1298 		else
1299 			str_notify_promisc_off_phys(dsp);
1300 		break;
1301 
1302 	case MAC_NOTE_PROMISC:
1303 		break;
1304 
1305 	case MAC_NOTE_UNICST:
1306 		/*
1307 		 * This notification is sent whenever the MAC unicast address
1308 		 * changes. We need to re-cache the address.
1309 		 */
1310 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1311 
1312 		/*
1313 		 * Send the appropriate DL_NOTIFY_IND.
1314 		 */
1315 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1316 		break;
1317 
1318 	case MAC_NOTE_LINK:
1319 		/*
1320 		 * This notification is sent every time the MAC driver
1321 		 * updates the link state.
1322 		 */
1323 		switch (mac_link_get(dsp->ds_mh)) {
1324 		case LINK_STATE_UP:
1325 			/*
1326 			 * The link is up so send the appropriate
1327 			 * DL_NOTIFY_IND.
1328 			 */
1329 			str_notify_link_up(dsp);
1330 
1331 			/*
1332 			 * If we can find the link speed then send a
1333 			 * DL_NOTIFY_IND for that too.
1334 			 */
1335 			if (dsp->ds_mip->mi_stat[MAC_STAT_IFSPEED]) {
1336 				uint64_t	val;
1337 
1338 				val = mac_stat_get(dsp->ds_mh,
1339 				    MAC_STAT_IFSPEED);
1340 				str_notify_speed(dsp,
1341 				    (uint32_t)(val / 1000ull));
1342 			}
1343 			break;
1344 
1345 		case LINK_STATE_DOWN:
1346 			/*
1347 			 * The link is down so send the appropriate
1348 			 * DL_NOTIFY_IND.
1349 			 */
1350 			str_notify_link_down(dsp);
1351 			break;
1352 
1353 		default:
1354 			break;
1355 		}
1356 		break;
1357 
1358 	case MAC_NOTE_RESOURCE:
1359 		/*
1360 		 * This notification is sent whenever the MAC resources
1361 		 * change. We need to renegotiate the capabilities.
1362 		 * Send the appropriate DL_NOTIFY_IND.
1363 		 */
1364 		str_notify_capab_reneg(dsp);
1365 		break;
1366 
1367 	default:
1368 		ASSERT(B_FALSE);
1369 		break;
1370 	}
1371 }
1372 
1373 /*
1374  * Enqueue one or more messages to the transmit queue.
1375  * Caller specifies the insertion position (head/tail).
1376  */
1377 void
1378 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, boolean_t head_insert)
1379 {
1380 	mblk_t	*tail;
1381 	queue_t *q = dsp->ds_wq;
1382 	uint_t	cnt, msgcnt;
1383 	uint_t	tot_cnt, tot_msgcnt;
1384 
1385 	ASSERT(DB_TYPE(mp) == M_DATA);
1386 	/* Calculate total size and count of the packet(s) */
1387 	for (tail = mp, cnt = msgdsize(mp), msgcnt = 1;
1388 	    tail->b_next != NULL; tail = tail->b_next) {
1389 		ASSERT(DB_TYPE(tail) == M_DATA);
1390 		cnt += msgdsize(tail);
1391 		msgcnt++;
1392 	}
1393 
1394 	mutex_enter(&dsp->ds_tx_list_lock);
1395 	/*
1396 	 * If the queue depth would exceed the allowed threshold, drop
1397 	 * new packet(s) and drain those already in the queue.
1398 	 */
1399 	tot_cnt = dsp->ds_tx_cnt + cnt;
1400 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
1401 
1402 	if (!head_insert &&
1403 	    (tot_cnt >= dld_max_q_count || tot_msgcnt >= dld_max_q_count)) {
1404 		ASSERT(dsp->ds_tx_qbusy);
1405 		mutex_exit(&dsp->ds_tx_list_lock);
1406 		freemsgchain(mp);
1407 		goto done;
1408 	}
1409 
1410 	/* Update the queue size parameters */
1411 	dsp->ds_tx_cnt = tot_cnt;
1412 	dsp->ds_tx_msgcnt = tot_msgcnt;
1413 
1414 	/*
1415 	 * If the transmit queue is currently empty and we are
1416 	 * about to deposit the packet(s) there, switch mode to
1417 	 * "busy" and raise flow-control condition.
1418 	 */
1419 	if (!dsp->ds_tx_qbusy) {
1420 		dsp->ds_tx_qbusy = B_TRUE;
1421 		ASSERT(dsp->ds_tx_flow_mp != NULL);
1422 		(void) putq(q, dsp->ds_tx_flow_mp);
1423 		dsp->ds_tx_flow_mp = NULL;
1424 	}
1425 
1426 	if (!head_insert) {
1427 		/* Tail insertion */
1428 		if (dsp->ds_tx_list_head == NULL)
1429 			dsp->ds_tx_list_head = mp;
1430 		else
1431 			dsp->ds_tx_list_tail->b_next = mp;
1432 		dsp->ds_tx_list_tail = tail;
1433 	} else {
1434 		/* Head insertion */
1435 		tail->b_next = dsp->ds_tx_list_head;
1436 		if (dsp->ds_tx_list_head == NULL)
1437 			dsp->ds_tx_list_tail = tail;
1438 		dsp->ds_tx_list_head = mp;
1439 	}
1440 	mutex_exit(&dsp->ds_tx_list_lock);
1441 done:
1442 	/* Schedule service thread to drain the transmit queue */
1443 	qenable(q);
1444 }
1445 
1446 void
1447 dld_tx_flush(dld_str_t *dsp)
1448 {
1449 	mutex_enter(&dsp->ds_tx_list_lock);
1450 	if (dsp->ds_tx_list_head != NULL) {
1451 		freemsgchain(dsp->ds_tx_list_head);
1452 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
1453 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
1454 		if (dsp->ds_tx_qbusy) {
1455 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
1456 			ASSERT(dsp->ds_tx_flow_mp != NULL);
1457 			dsp->ds_tx_qbusy = B_FALSE;
1458 		}
1459 	}
1460 	mutex_exit(&dsp->ds_tx_list_lock);
1461 }
1462 
1463 /*
1464  * Process an M_IOCTL message.
1465  */
1466 static void
1467 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1468 {
1469 	uint_t			cmd;
1470 
1471 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1472 	ASSERT(dsp->ds_type == DLD_DLPI);
1473 
1474 	switch (cmd) {
1475 	case DLIOCRAW:
1476 		ioc_raw(dsp, mp);
1477 		break;
1478 	case DLIOCHDRINFO:
1479 		ioc_fast(dsp, mp);
1480 		break;
1481 	default:
1482 		ioc(dsp, mp);
1483 	}
1484 }
1485 
1486 /*
1487  * DLIOCRAW
1488  */
1489 static void
1490 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1491 {
1492 	queue_t *q = dsp->ds_wq;
1493 
1494 	rw_enter(&dsp->ds_lock, RW_WRITER);
1495 	if (dsp->ds_polling) {
1496 		rw_exit(&dsp->ds_lock);
1497 		miocnak(q, mp, 0, EPROTO);
1498 		return;
1499 	}
1500 
1501 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1502 		/*
1503 		 * Set the receive callback.
1504 		 */
1505 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, (void *)dsp);
1506 
1507 		/*
1508 		 * Note that raw mode is enabled.
1509 		 */
1510 		dsp->ds_mode = DLD_RAW;
1511 	}
1512 
1513 	rw_exit(&dsp->ds_lock);
1514 	miocack(q, mp, 0, 0);
1515 }
1516 
1517 /*
1518  * DLIOCHDRINFO
1519  */
1520 static void
1521 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1522 {
1523 	dl_unitdata_req_t *dlp;
1524 	off_t		off;
1525 	size_t		len;
1526 	const uint8_t	*addr;
1527 	uint16_t	sap;
1528 	mblk_t		*nmp;
1529 	mblk_t		*hmp;
1530 	uint_t		addr_length;
1531 	queue_t		*q = dsp->ds_wq;
1532 	int		err;
1533 	dls_channel_t	dc;
1534 
1535 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
1536 		err = ENOTSUP;
1537 		goto failed;
1538 	}
1539 
1540 	nmp = mp->b_cont;
1541 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
1542 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
1543 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
1544 		err = EINVAL;
1545 		goto failed;
1546 	}
1547 
1548 	off = dlp->dl_dest_addr_offset;
1549 	len = dlp->dl_dest_addr_length;
1550 
1551 	if (!MBLKIN(nmp, off, len)) {
1552 		err = EINVAL;
1553 		goto failed;
1554 	}
1555 
1556 	rw_enter(&dsp->ds_lock, RW_READER);
1557 	if (dsp->ds_dlstate != DL_IDLE) {
1558 		rw_exit(&dsp->ds_lock);
1559 		err = ENOTSUP;
1560 		goto failed;
1561 	}
1562 
1563 	addr_length = dsp->ds_mip->mi_addr_length;
1564 	if (len != addr_length + sizeof (uint16_t)) {
1565 		rw_exit(&dsp->ds_lock);
1566 		err = EINVAL;
1567 		goto failed;
1568 	}
1569 
1570 	addr = nmp->b_rptr + off;
1571 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
1572 	dc = dsp->ds_dc;
1573 
1574 	if ((hmp = dls_header(dc, addr, sap, dsp->ds_pri)) == NULL) {
1575 		rw_exit(&dsp->ds_lock);
1576 		err = ENOMEM;
1577 		goto failed;
1578 	}
1579 
1580 	/*
1581 	 * This is a performance optimization.  We originally entered
1582 	 * as reader and only become writer upon transitioning into
1583 	 * the DLD_FASTPATH mode for the first time.  Otherwise we
1584 	 * stay as reader and return the fast-path header to IP.
1585 	 */
1586 	if (dsp->ds_mode != DLD_FASTPATH) {
1587 		if (!rw_tryupgrade(&dsp->ds_lock)) {
1588 			rw_exit(&dsp->ds_lock);
1589 			rw_enter(&dsp->ds_lock, RW_WRITER);
1590 
1591 			/*
1592 			 * State may have changed before we re-acquired
1593 			 * the writer lock in case the upgrade failed.
1594 			 */
1595 			if (dsp->ds_dlstate != DL_IDLE) {
1596 				rw_exit(&dsp->ds_lock);
1597 				err = ENOTSUP;
1598 				goto failed;
1599 			}
1600 		}
1601 
1602 		/*
1603 		 * Set the receive callback (unless polling is enabled).
1604 		 */
1605 		if (!dsp->ds_polling)
1606 			dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
1607 
1608 		/*
1609 		 * Note that fast-path mode is enabled.
1610 		 */
1611 		dsp->ds_mode = DLD_FASTPATH;
1612 	}
1613 	rw_exit(&dsp->ds_lock);
1614 
1615 	freemsg(nmp->b_cont);
1616 	nmp->b_cont = hmp;
1617 
1618 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
1619 	return;
1620 failed:
1621 	miocnak(q, mp, 0, err);
1622 }
1623 
1624 /*
1625  * Catch-all handler.
1626  */
1627 static void
1628 ioc(dld_str_t *dsp, mblk_t *mp)
1629 {
1630 	queue_t	*q = dsp->ds_wq;
1631 	mac_handle_t mh;
1632 
1633 	rw_enter(&dsp->ds_lock, RW_READER);
1634 	if (dsp->ds_dlstate == DL_UNATTACHED) {
1635 		rw_exit(&dsp->ds_lock);
1636 		miocnak(q, mp, 0, EINVAL);
1637 		return;
1638 	}
1639 	mh = dsp->ds_mh;
1640 	ASSERT(mh != NULL);
1641 	rw_exit(&dsp->ds_lock);
1642 	mac_ioctl(mh, q, mp);
1643 }
1644 
1645 /*
1646  * Allocate a new minor number.
1647  */
1648 static minor_t
1649 dld_minor_hold(boolean_t sleep)
1650 {
1651 	minor_t		minor;
1652 
1653 	/*
1654 	 * Grab a value from the arena.
1655 	 */
1656 	atomic_add_32(&minor_count, 1);
1657 	if ((minor = PTR_TO_MINOR(vmem_alloc(minor_arenap, 1,
1658 	    (sleep) ? VM_SLEEP : VM_NOSLEEP))) == 0) {
1659 		atomic_add_32(&minor_count, -1);
1660 		return (0);
1661 	}
1662 
1663 	return (minor);
1664 }
1665 
1666 /*
1667  * Release a previously allocated minor number.
1668  */
1669 static void
1670 dld_minor_rele(minor_t minor)
1671 {
1672 	/*
1673 	 * Return the value to the arena.
1674 	 */
1675 	vmem_free(minor_arenap, MINOR_TO_PTR(minor), 1);
1676 
1677 	atomic_add_32(&minor_count, -1);
1678 }
1679