xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision 7b209c2c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Data-Link Driver
30  */
31 
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/strsubr.h>
35 #include	<sys/atomic.h>
36 #include	<sys/disp.h>
37 #include	<sys/callb.h>
38 #include	<sys/vlan.h>
39 #include	<sys/dld.h>
40 #include	<sys/dld_impl.h>
41 #include	<sys/dls_impl.h>
42 #include	<inet/common.h>
43 
44 static int	str_constructor(void *, void *, int);
45 static void	str_destructor(void *, void *);
46 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
47 static void	str_notify_promisc_on_phys(dld_str_t *);
48 static void	str_notify_promisc_off_phys(dld_str_t *);
49 static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
50 static void	str_notify_link_up(dld_str_t *);
51 static void	str_notify_link_down(dld_str_t *);
52 static void	str_notify_capab_reneg(dld_str_t *);
53 static void	str_notify_speed(dld_str_t *, uint32_t);
54 static void	str_notify(void *, mac_notify_type_t);
55 
56 static void	ioc_native(dld_str_t *,  mblk_t *);
57 static void	ioc_margin(dld_str_t *, mblk_t *);
58 static void	ioc_raw(dld_str_t *, mblk_t *);
59 static void	ioc_fast(dld_str_t *,  mblk_t *);
60 static void	ioc(dld_str_t *, mblk_t *);
61 static void	dld_tx_enqueue(dld_str_t *, mblk_t *, mblk_t *, boolean_t,
62 		    uint_t, uint_t);
63 static void	dld_wput_nondata(dld_str_t *, mblk_t *);
64 static void	dld_wput_nondata_task(void *);
65 static void	dld_flush_nondata(dld_str_t *);
66 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t);
67 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *);
68 
static uint32_t		str_count;	/* # of dld_str_t objects outstanding */
static kmem_cache_t	*str_cachep;	/* kmem cache for dld_str_t objects */
static taskq_t		*dld_disp_taskq = NULL;	/* taskq for nondata requests */
static mod_hash_t	*str_hashp;	/* dld_str_t hash, keyed by clone minor */

#define	STR_HASHSZ		64
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

static inline uint_t	mp_getsize(mblk_t *);
78 
79 /*
80  * Interval to count the TX queued depth. Default is 1s (1000000us).
81  * Count the queue depth immediately (not by timeout) if this is set to 0.
82  * See more details above dld_tx_enqueue().
83  */
84 uint_t tx_qdepth_interval = 1000000;
85 
86 /*
87  * Some notes on entry points, flow-control, queueing and locking:
88  *
89  * This driver exports the traditional STREAMS put entry point as well as
90  * the non-STREAMS fast-path transmit routine which is provided to IP via
91  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
92  * and data operations, while the fast-path routine deals only with M_DATA
93  * fast-path packets.  Regardless of the entry point, all outbound packets
94  * will end up in dld_tx_single(), where they will be delivered to the MAC
95  * driver.
96  *
97  * The transmit logic operates in two modes: a "not busy" mode where the
98  * packets will be delivered to the MAC for a send attempt, or "busy" mode
99  * where they will be enqueued in the internal queue because of flow-control.
100  * Flow-control happens when the MAC driver indicates the packets couldn't
101  * be transmitted due to lack of resources (e.g. running out of descriptors).
102  * In such case, the driver will place a dummy message on its write-side
103  * STREAMS queue so that the queue is marked as "full".  Any subsequent
104  * packets arriving at the driver will be enqueued in the internal queue,
105  * which is drained in the context of the service thread that gets scheduled
106  * whenever the driver is in the "busy" mode.  When all packets have been
107  * successfully delivered by MAC and the internal queue is empty, it will
108  * transition to the "not busy" mode by removing the dummy message from the
109  * write-side STREAMS queue; in effect this will trigger backenabling.
110  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
111  * to the above reasons.
112  *
113  * The driver implements an internal transmit queue independent of STREAMS.
114  * This allows for flexibility and provides a fast enqueue/dequeue mechanism
 * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
116  * getq() operations done by the driver are those related to placing and
117  * removing the dummy message to/from the write-side STREAMS queue for flow-
118  * control purposes.
119  *
120  * Locking is done independent of STREAMS due to the driver being fully MT.
121  * Threads entering the driver (either from put or service entry points)
122  * will most likely be readers, with the exception of a few writer cases
123  * such those handling DLPI attach/detach/bind/unbind/etc. or any of the
124  * DLD-related ioctl requests.  The DLPI detach case is special, because
125  * it involves freeing resources and therefore must be single-threaded.
126  * Unfortunately the readers/writers lock can't be used to protect against
127  * it, because the lock is dropped prior to the driver calling places where
128  * putnext() may be invoked, and such places may depend on those resources
129  * to exist.  Because of this, the driver always completes the DLPI detach
130  * process when there are no other threads running in the driver.  This is
 * done by keeping track of the number of threads, such that the last
132  * thread leaving the driver will finish the pending DLPI detach operation.
133  */
134 
135 /*
136  * dld_max_q_count is the queue depth threshold used to limit the number of
137  * outstanding packets or bytes allowed in the queue; once this limit is
138  * reached the driver will free any incoming ones until the queue depth
139  * drops below the threshold.
140  *
 * This buffering is provided to accommodate clients which do not employ
142  * their own buffering scheme, and to handle occasional packet bursts.
143  * Clients which handle their own buffering will receive positive feedback
144  * from this driver as soon as it transitions into the "busy" state, i.e.
145  * when the queue is initially filled up; they will get backenabled once
146  * the queue is empty.
147  *
148  * The value chosen here is rather arbitrary; in future some intelligent
149  * heuristics may be involved which could take into account the hardware's
150  * transmit ring size, etc.
151  */
152 uint_t dld_max_q_count = (16 * 1024 *1024);
153 
154 /*
155  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
156  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
157  * match dev_t. If a stream is found and it is attached, its dev_info_t *
158  * is returned.
159  */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* major number being searched for */
	minor_t		ds_minor;	/* clone minor being searched for */
	dev_info_t	*ds_dip;	/* out: devinfo of the matching stream */
} i_dld_str_state_t;
165 
166 /* ARGSUSED */
167 static uint_t
168 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
169 {
170 	i_dld_str_state_t	*statep = arg;
171 	dld_str_t		*dsp = (dld_str_t *)val;
172 
173 	if (statep->ds_major != dsp->ds_major)
174 		return (MH_WALK_CONTINUE);
175 
176 	ASSERT(statep->ds_minor != 0);
177 
178 	/*
179 	 * Access to ds_mh needs to be protected by ds_lock.
180 	 */
181 	rw_enter(&dsp->ds_lock, RW_READER);
182 	if (statep->ds_minor == dsp->ds_minor) {
183 		/*
184 		 * Clone: a clone minor is unique. we can terminate the
185 		 * walk if we find a matching stream -- even if we fail
186 		 * to obtain the devinfo.
187 		 */
188 		if (dsp->ds_mh != NULL)
189 			statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
190 		rw_exit(&dsp->ds_lock);
191 		return (MH_WALK_TERMINATE);
192 	}
193 	rw_exit(&dsp->ds_lock);
194 	return (MH_WALK_CONTINUE);
195 }
196 
197 static dev_info_t *
198 dld_finddevinfo(dev_t dev)
199 {
200 	dev_info_t	*dip;
201 	i_dld_str_state_t	state;
202 
203 	if (getminor(dev) == 0)
204 		return (NULL);
205 
206 	/*
207 	 * See if it's a minor node of a link
208 	 */
209 	if ((dip = dls_finddevinfo(dev)) != NULL)
210 		return (dip);
211 
212 	state.ds_minor = getminor(dev);
213 	state.ds_major = getmajor(dev);
214 	state.ds_dip = NULL;
215 
216 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
217 	return (state.ds_dip);
218 }
219 
220 /*
221  * devo_getinfo: getinfo(9e)
222  */
223 /*ARGSUSED*/
224 int
225 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
226 {
227 	dev_info_t	*devinfo;
228 	minor_t		minor = getminor((dev_t)arg);
229 	int		rc = DDI_FAILURE;
230 
231 	switch (cmd) {
232 	case DDI_INFO_DEVT2DEVINFO:
233 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
234 			*(dev_info_t **)resp = devinfo;
235 			rc = DDI_SUCCESS;
236 		}
237 		break;
238 	case DDI_INFO_DEVT2INSTANCE:
239 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
240 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
241 			rc = DDI_SUCCESS;
242 		} else if (minor > DLS_MAX_MINOR &&
243 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
244 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
245 			rc = DDI_SUCCESS;
246 		}
247 		break;
248 	}
249 	return (rc);
250 }
251 
/*
 * qi_qopen: open(9e)
 *
 * Cloning open.  Minor 0 is the style 2 provider node; any other minor
 * is a style 1 node whose PPA is implied by the minor number.
 */
/*ARGSUSED*/
int
dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	dld_str_t	*dsp;
	major_t		major;
	minor_t		minor;
	int		err;

	/* Only device opens are supported, not module pushes. */
	if (sflag == MODOPEN)
		return (ENOTSUP);

	/*
	 * This is a cloning driver and therefore each queue should only
	 * ever get opened once.
	 */
	if (rq->q_ptr != NULL)
		return (EBUSY);

	major = getmajor(*devp);
	minor = getminor(*devp);

	/*
	 * Create a new dld_str_t for the stream. This will grab a new minor
	 * number that will be handed back in the cloned dev_t.  Creation may
	 * fail if we can't allocate the dummy mblk used for flow-control.
	 */
	dsp = dld_str_create(rq, DLD_DLPI, major,
	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
	if (dsp == NULL)
		return (ENOSR);

	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
	if (minor != 0) {
		/*
		 * Style 1 open: attach immediately to the PPA implied by
		 * the minor number (minor - 1).
		 */
		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
			goto failed;
		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
	} else {
		/* Style 2: no device instance until DL_ATTACH_REQ. */
		(void) qassociate(rq, -1);
	}

	/*
	 * Enable the queue srv(9e) routine.
	 */
	qprocson(rq);

	/*
	 * Construct a cloned dev_t to hand back.
	 */
	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
	return (0);

failed:
	dld_str_destroy(dsp);
	return (err);
}
314 
315 /*
316  * qi_qclose: close(9e)
317  */
318 int
319 dld_close(queue_t *rq)
320 {
321 	dld_str_t	*dsp = rq->q_ptr;
322 
323 	/*
324 	 * Disable the queue srv(9e) routine.
325 	 */
326 	qprocsoff(rq);
327 
328 	dld_finish_pending_task(dsp);
329 
330 	/*
331 	 * This stream was open to a provider node. Check to see
332 	 * if it has been cleanly shut down.
333 	 */
334 	if (dsp->ds_dlstate != DL_UNATTACHED) {
335 		/*
336 		 * The stream is either open to a style 1 provider or
337 		 * this is not clean shutdown. Detach from the PPA.
338 		 * (This is still ok even in the style 1 case).
339 		 */
340 		dld_str_detach(dsp);
341 	}
342 
343 	dld_str_destroy(dsp);
344 	return (0);
345 }
346 
/*
 * qi_qputp: put(9e)
 *
 * Write-side put procedure.  M_DATA and DL_UNITDATA_REQ messages are
 * transmitted inline under DLD_TX_ENTER/DLD_TX_EXIT; all other control
 * messages are handed to the nondata (taskq) path.
 */
void
dld_wput(queue_t *wq, mblk_t *mp)
{
	dld_str_t	*dsp = wq->q_ptr;

	switch (DB_TYPE(mp)) {
	case M_DATA: {
		dld_tx_t tx;

		/*
		 * ds_tx may be NULL (e.g. the stream is not ready for
		 * transmit — presumably unbound or detaching; confirm
		 * against where ds_tx is set); in that case the packet
		 * is silently dropped.
		 */
		DLD_TX_ENTER(dsp);
		if ((tx = dsp->ds_tx) != NULL)
			tx(dsp, mp);
		else
			freemsg(mp);
		DLD_TX_EXIT(dsp);
		break;
	}
	case M_PROTO:
	case M_PCPROTO: {
		t_uscalar_t	prim;
		dld_tx_t	tx;

		/* Discard runts that cannot hold a DLPI primitive. */
		if (MBLKL(mp) < sizeof (t_uscalar_t)) {
			freemsg(mp);
			return;
		}

		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
		if (prim != DL_UNITDATA_REQ) {
			/* Control path */
			dld_wput_nondata(dsp, mp);
			break;
		}

		/* Data path */
		DLD_TX_ENTER(dsp);
		if ((tx = dsp->ds_unitdata_tx) != NULL)
			tx(dsp, mp);
		else
			dlerrorack(wq, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0);
		DLD_TX_EXIT(dsp);
		break;
	}
	case M_IOCTL:
	case M_IOCDATA:
		/* Control path */
		dld_wput_nondata(dsp, mp);
		break;
	case M_FLUSH:
		/*
		 * Flush both the data messages and the control messages.
		 */
		if (*mp->b_rptr & FLUSHW) {
			dld_flush_nondata(dsp);
			dld_tx_flush(dsp);
			*mp->b_rptr &= ~FLUSHW;
		}

		/* Reflect the flush up the read side if FLUSHR is set. */
		if (*mp->b_rptr & FLUSHR) {
			qreply(wq, mp);
		} else {
			freemsg(mp);
		}
		break;
	default:
		/* Unexpected message type: drop it. */
		freemsg(mp);
		break;
	}
}
419 
420 /*
421  * Called by GLDv3 control node to process the ioctls. It will start
422  * a taskq to allow the ioctl processing to block. This is a temporary
423  * solution, and will be replaced by a more graceful approach afterwards.
424  */
425 void
426 dld_ioctl(queue_t *wq, mblk_t *mp)
427 {
428 	dld_wput_nondata(wq->q_ptr, mp);
429 }
430 
/*
 * qi_srvp: srv(9e)
 *
 * Drain the internal transmit queue (filled while flow-controlled) and,
 * once it is empty, lift flow control by pulling the dummy message back
 * off the write-side STREAMS queue.
 */
void
dld_wsrv(queue_t *wq)
{
	mblk_t		*mp, *head, *tail;
	dld_str_t	*dsp = wq->q_ptr;
	uint_t		cnt, msgcnt;
	timeout_id_t	tid = 0;

	rw_enter(&dsp->ds_lock, RW_READER);
	/*
	 * Grab all packets (chained via b_next) off our transmit queue
	 * and try to send them all to the MAC layer.  Since the queue
	 * is independent of streams, we are able to dequeue all messages
	 * at once without looping through getq() and manually chaining
	 * them.  Note that the queue size parameters (byte and message
	 * counts) are cleared as well, but we postpone the backenabling
	 * until after the MAC transmit since some packets may end up
	 * back at our transmit queue.
	 */
	mutex_enter(&dsp->ds_tx_list_lock);
	if ((mp = dsp->ds_tx_list_head) == NULL) {
		/* Nothing queued: all the bookkeeping must agree. */
		ASSERT(!dsp->ds_tx_qbusy);
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		ASSERT(dsp->ds_tx_list_head == NULL);
		ASSERT(dsp->ds_tx_list_tail == NULL);
		ASSERT(dsp->ds_tx_cnt == 0);
		ASSERT(dsp->ds_tx_msgcnt == 0);
		mutex_exit(&dsp->ds_tx_list_lock);
		rw_exit(&dsp->ds_lock);
		return;
	}
	head = mp;
	tail = dsp->ds_tx_list_tail;
	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
	cnt = dsp->ds_tx_cnt;
	msgcnt = dsp->ds_tx_msgcnt;
	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
	mutex_exit(&dsp->ds_tx_list_lock);

	/*
	 * Discard packets unless we are attached and bound; note that
	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
	 * because regardless of the mode all transmit will end up in
	 * dld_tx_single() where the packets may be queued.
	 */
	ASSERT((DB_TYPE(mp) == M_DATA) || (DB_TYPE(mp) == M_MULTIDATA));
	if (dsp->ds_dlstate != DL_IDLE) {
		freemsgchain(mp);
		goto done;
	}

	/*
	 * Attempt to transmit one or more packets.  If the MAC can't
	 * send them all, re-queue the packet(s) at the beginning of
	 * the transmit queue to avoid any re-ordering.
	 */
	mp = dls_tx(dsp->ds_dc, mp);
	if (mp == head) {
		/*
		 * No message was sent out. Take the saved queue depth
		 * as the input, so that dld_tx_enqueue() need not
		 * calculate it again.
		 */
		dld_tx_enqueue(dsp, mp, tail, B_TRUE, msgcnt, cnt);
	} else if (mp != NULL) {
		/*
		 * Some but not all messages were sent out. dld_tx_enqueue()
		 * needs to start the timer to calculate the queue depth if
		 * timer has not been started.
		 *
		 * Note that a timer is used to calculate the queue depth
		 * to improve network performance, especially for TCP, in
		 * which case packets are sent without canput() being checked,
		 * and mostly end up in dld_tx_enqueue() under heavy load.
		 */
		dld_tx_enqueue(dsp, mp, tail, B_TRUE, 0, 0);
	}

done:
	/*
	 * Grab the list lock again and check if the transmit queue is
	 * really empty; if so, lift up flow-control and backenable any
	 * writer queues.  If the queue is not empty, schedule service
	 * thread to drain it.
	 */
	mutex_enter(&dsp->ds_tx_list_lock);
	if (dsp->ds_tx_list_head == NULL) {
		dsp->ds_tx_flow_mp = getq(wq);
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		dsp->ds_tx_qbusy = B_FALSE;
		/* Capture any pending qdepth timeout so we can cancel it. */
		if ((tid = dsp->ds_tx_qdepth_tid) != 0)
			dsp->ds_tx_qdepth_tid = 0;
	}
	mutex_exit(&dsp->ds_tx_list_lock);

	/*
	 * Note that ds_tx_list_lock (which is acquired by the timeout
	 * callback routine) cannot be held across the call to untimeout().
	 */
	if (tid != 0)
		(void) untimeout(tid);

	rw_exit(&dsp->ds_lock);
}
538 
/*
 * Populate a driver's dev_ops with the shared DLD STREAMS entry points.
 * All structures allocated here are released by dld_fini_ops().
 */
void
dld_init_ops(struct dev_ops *ops, const char *name)
{
	struct streamtab *stream;
	struct qinit *rq, *wq;
	struct module_info *modinfo;

	/* One module_info is shared by the read and write qinits. */
	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
	modinfo->mi_minpsz = 0;
	modinfo->mi_maxpsz = 64*1024;
	/*
	 * hiwat/lowat of 1/0: flow control is driven by a single dummy
	 * message on the write queue (see the block comment at the top
	 * of this file).
	 */
	modinfo->mi_hiwat  = 1;
	modinfo->mi_lowat = 0;

	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
	rq->qi_qopen = dld_open;
	rq->qi_qclose = dld_close;
	rq->qi_minfo = modinfo;

	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
	wq->qi_putp = (pfi_t)dld_wput;
	wq->qi_srvp = (pfi_t)dld_wsrv;
	wq->qi_minfo = modinfo;

	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
	stream->st_rdinit = rq;
	stream->st_wrinit = wq;
	ops->devo_cb_ops->cb_str = stream;

	ops->devo_getinfo = &dld_getinfo;
}
571 
/*
 * Free the streamtab, qinit and module_info structures that were
 * allocated by dld_init_ops().
 */
void
dld_fini_ops(struct dev_ops *ops)
{
	struct streamtab *stream;
	struct qinit *rq, *wq;
	struct module_info *modinfo;

	stream = ops->devo_cb_ops->cb_str;
	rq = stream->st_rdinit;
	wq = stream->st_wrinit;
	modinfo = rq->qi_minfo;
	/* Both qinits share a single module_info; free it only once. */
	ASSERT(wq->qi_minfo == modinfo);

	kmem_free(stream, sizeof (struct streamtab));
	kmem_free(wq, sizeof (struct qinit));
	kmem_free(rq, sizeof (struct qinit));
	kmem_free(modinfo->mi_idname, FMNAMESZ);
	kmem_free(modinfo, sizeof (struct module_info));
}
591 
/*
 * Initialize this module's data structures.
 * Called once at module load; undone by dld_str_fini().
 */
void
dld_str_init(void)
{
	/*
	 * Create dld_str_t object cache.
	 */
	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
	ASSERT(str_cachep != NULL);

	/*
	 * Create taskq to process DLPI requests.
	 */
	dld_disp_taskq = taskq_create("dld_disp_taskq", 1024, MINCLSYSPRI, 2,
	    INT_MAX, TASKQ_DYNAMIC | TASKQ_PREPOPULATE);

	/*
	 * Create a hash table for maintaining dld_str_t's.
	 * The ds_minor field (the clone minor number) of a dld_str_t
	 * is used as a key for this hash table because this number is
	 * globally unique (allocated from "dls_minor_arena").
	 */
	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
	    mod_hash_null_valdtor);
}
620 
621 /*
622  * Tear down this module's data structures.
623  */
624 int
625 dld_str_fini(void)
626 {
627 	/*
628 	 * Make sure that there are no objects in use.
629 	 */
630 	if (str_count != 0)
631 		return (EBUSY);
632 
633 	ASSERT(dld_disp_taskq != NULL);
634 	taskq_destroy(dld_disp_taskq);
635 	dld_disp_taskq = NULL;
636 
637 	/*
638 	 * Destroy object cache.
639 	 */
640 	kmem_cache_destroy(str_cachep);
641 	mod_hash_destroy_idhash(str_hashp);
642 	return (0);
643 }
644 
/*
 * Create a new dld_str_t object.
 *
 * Allocates the per-stream state (including the dummy flow-control mblk),
 * wires it to the read/write queue pair and inserts it into the global
 * minor-number hash.  Returns NULL only if the dummy mblk cannot be
 * allocated.
 */
dld_str_t *
dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
{
	dld_str_t	*dsp;
	int		err;

	/*
	 * Allocate an object from the cache.
	 */
	atomic_add_32(&str_count, 1);
	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);

	/*
	 * Allocate the dummy mblk for flow-control.
	 */
	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
	if (dsp->ds_tx_flow_mp == NULL) {
		kmem_cache_free(str_cachep, dsp);
		atomic_add_32(&str_count, -1);
		return (NULL);
	}
	dsp->ds_type = type;
	dsp->ds_major = major;
	dsp->ds_style = style;
	dsp->ds_tx = dsp->ds_unitdata_tx = NULL;

	/*
	 * Initialize the queue pointers.
	 */
	ASSERT(RD(rq) == rq);
	dsp->ds_rq = rq;
	dsp->ds_wq = WR(rq);
	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;

	/*
	 * We want explicit control over our write-side STREAMS queue
	 * where the dummy mblk gets added/removed for flow-control.
	 */
	noenable(WR(rq));

	/*
	 * ds_minor was assigned in str_constructor() and is globally
	 * unique, so this insert cannot collide.
	 */
	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
	    (mod_hash_val_t)dsp);
	ASSERT(err == 0);
	return (dsp);
}
693 
/*
 * Block until the worker thread has processed this stream's pending
 * nondata requests (ds_tid becomes NULL).
 */
void
dld_finish_pending_task(dld_str_t *dsp)
{
	/*
	 * Wait until the pending requests are processed by the worker thread.
	 * NOTE(review): ds_closing presumably tells the worker not to
	 * schedule further work; it is cleared afterwards so the stream
	 * remains usable — confirm against dld_wput_nondata_task().
	 */
	mutex_enter(&dsp->ds_disp_lock);
	dsp->ds_closing = B_TRUE;
	while (dsp->ds_tid != NULL)
		cv_wait(&dsp->ds_disp_cv, &dsp->ds_disp_lock);
	dsp->ds_closing = B_FALSE;
	mutex_exit(&dsp->ds_disp_lock);
}
707 
/*
 * Destroy a dld_str_t object.
 *
 * Caller must have already quiesced the stream (queues off, pending
 * tasks finished, detached) — the ASSERTs below check that state.
 * Resets reusable fields and returns the object to the kmem cache.
 */
void
dld_str_destroy(dld_str_t *dsp)
{
	queue_t		*rq;
	queue_t		*wq;
	mod_hash_val_t	val;
	/*
	 * Clear the queue pointers.
	 */
	rq = dsp->ds_rq;
	wq = dsp->ds_wq;
	ASSERT(wq == WR(rq));

	rq->q_ptr = wq->q_ptr = NULL;
	dsp->ds_rq = dsp->ds_wq = NULL;

	/* The transmit side must be idle and fully drained. */
	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
	ASSERT(dsp->ds_tx_list_head == NULL);
	ASSERT(dsp->ds_tx_list_tail == NULL);
	ASSERT(dsp->ds_tx_cnt == 0);
	ASSERT(dsp->ds_tx_msgcnt == 0);
	ASSERT(dsp->ds_tx_qdepth_tid == 0);
	ASSERT(!dsp->ds_tx_qbusy);

	/* No nondata requests may still be pending. */
	ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
	ASSERT(dsp->ds_pending_head == NULL);
	ASSERT(dsp->ds_pending_tail == NULL);
	ASSERT(dsp->ds_tx == NULL);
	ASSERT(dsp->ds_unitdata_tx == NULL);

	/*
	 * Reinitialize all the flags.
	 */
	dsp->ds_notifications = 0;
	dsp->ds_passivestate = DLD_UNINITIALIZED;
	dsp->ds_mode = DLD_UNITDATA;
	dsp->ds_native = B_FALSE;

	/*
	 * Free the dummy mblk if exists.
	 */
	if (dsp->ds_tx_flow_mp != NULL) {
		freeb(dsp->ds_tx_flow_mp);
		dsp->ds_tx_flow_mp = NULL;
	}

	/* Remove the stream from the global minor-number hash. */
	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
	ASSERT(dsp == (dld_str_t *)val);

	/*
	 * Free the object back to the cache.
	 */
	kmem_cache_free(str_cachep, dsp);
	atomic_add_32(&str_count, -1);
}
767 
/*
 * kmem_cache constructor function: see kmem_cache_create(9f).
 * Returns -1 (allocation failure) if no minor number is available.
 */
/*ARGSUSED*/
static int
str_constructor(void *buf, void *cdrarg, int kmflags)
{
	dld_str_t	*dsp = buf;

	bzero(buf, sizeof (dld_str_t));

	/*
	 * Allocate a new minor number.  May sleep only if the caller
	 * allowed a sleeping allocation.
	 */
	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
		return (-1);

	/*
	 * Initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;

	/* Locks and condition variables live for the cache lifetime. */
	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&dsp->ds_disp_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&dsp->ds_disp_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&dsp->ds_tx_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&dsp->ds_tx_cv, NULL, CV_DRIVER, NULL);

	return (0);
}
799 
/*
 * kmem_cache destructor function.
 * Verifies the object was fully quiesced, then releases the minor
 * number and tears down the synchronization primitives.
 */
/*ARGSUSED*/
static void
str_destructor(void *buf, void *cdrarg)
{
	dld_str_t	*dsp = buf;

	/*
	 * Make sure the DLPI state machine was reset.
	 */
	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);

	/*
	 * Make sure the data-link interface was closed.
	 */
	ASSERT(dsp->ds_mh == NULL);
	ASSERT(dsp->ds_dc == NULL);
	ASSERT(dsp->ds_tx == NULL);
	ASSERT(dsp->ds_unitdata_tx == NULL);
	ASSERT(dsp->ds_intx_cnt == 0);
	ASSERT(dsp->ds_detaching == B_FALSE);

	/*
	 * Make sure enabled notifications are cleared.
	 */
	ASSERT(dsp->ds_notifications == 0);

	/*
	 * Make sure polling is disabled.
	 */
	ASSERT(!dsp->ds_polling);

	/*
	 * Release the minor number.
	 */
	mac_minor_rele(dsp->ds_minor);

	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
	rw_destroy(&dsp->ds_lock);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
	mutex_destroy(&dsp->ds_tx_list_lock);
	ASSERT(dsp->ds_tx_flow_mp == NULL);
	ASSERT(dsp->ds_pending_head == NULL);
	ASSERT(dsp->ds_pending_tail == NULL);
	ASSERT(!dsp->ds_closing);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
	mutex_destroy(&dsp->ds_disp_lock);
	cv_destroy(&dsp->ds_disp_cv);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_lock));
	mutex_destroy(&dsp->ds_tx_lock);
	cv_destroy(&dsp->ds_tx_cv);
}
857 
/*
 * Transmit a single packet, enqueueing it on the internal transmit queue
 * if the stream is flow-controlled or the MAC could not accept it.
 */
void
dld_tx_single(dld_str_t *dsp, mblk_t *mp)
{
	/*
	 * If we are busy enqueue the packet and return.
	 * Otherwise hand them over to the MAC driver for transmission.
	 * If the message didn't get sent it will be queued.
	 *
	 * Note here that we don't grab the list lock prior to checking
	 * the busy flag.  This is okay, because a missed transition
	 * will not cause any packet reordering for any particular TCP
	 * connection (which is single-threaded).  The enqueue routine
	 * will atomically set the busy flag and schedule the service
	 * thread to run; the flag is only cleared by the service thread
	 * when there is no more packet to be transmitted.
	 *
	 * The || short-circuits: when ds_tx_qbusy is set, dls_tx() is
	 * never called and the original mp is enqueued unchanged.
	 */

	if (dsp->ds_tx_qbusy || ((mp = dls_tx(dsp->ds_dc, mp)) != NULL))
		dld_tx_enqueue(dsp, mp, mp, B_FALSE, 1, mp_getsize(mp));
}
878 
/*
 * Update the priority bits and VID (may need to insert tag if mp points
 * to an untagged packet).
 * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
 * If pri is 0, use the priority encoded in the packet.
 *
 * Returns the (possibly reallocated) message, or NULL on allocation
 * failure — in which case the original message has been freed.
 */
static mblk_t *
i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid)
{
	mblk_t *hmp;
	struct ether_vlan_header *evhp;
	struct ether_header *ehp;
	uint16_t old_tci = 0;
	size_t len;

	/* Caller must want to change at least one of priority or VID. */
	ASSERT(pri != 0 || vid != VLAN_ID_NONE);

	evhp = (struct ether_vlan_header *)mp->b_rptr;
	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
		/*
		 * Tagged packet, update the priority bits.
		 */
		old_tci = ntohs(evhp->ether_tci);
		len = sizeof (struct ether_vlan_header);

		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
			/*
			 * In case some drivers only check the db_ref
			 * count of the first mblk, we pullup the
			 * message into a single mblk.
			 */
			hmp = msgpullup(mp, -1);
			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
				freemsg(hmp);
				return (NULL);
			} else {
				freemsg(mp);
				mp = hmp;
			}
		}

		/* b_rptr may have moved: re-derive the header pointer. */
		evhp = (struct ether_vlan_header *)mp->b_rptr;
	} else {
		/*
		 * Untagged packet. Insert the special priority tag.
		 * First allocate a header mblk.
		 */
		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
		if (hmp == NULL)
			return (NULL);

		evhp = (struct ether_vlan_header *)hmp->b_rptr;
		ehp = (struct ether_header *)mp->b_rptr;

		/*
		 * Copy the MAC addresses and typelen
		 */
		bcopy(ehp, evhp, (ETHERADDRL * 2));
		evhp->ether_type = ehp->ether_type;
		evhp->ether_tpid = htons(ETHERTYPE_VLAN);

		hmp->b_wptr += sizeof (struct ether_vlan_header);
		mp->b_rptr += sizeof (struct ether_header);

		/*
		 * Free the original message if it's now empty. Link the
		 * rest of the messages to the header message.
		 */
		if (MBLKL(mp) == 0) {
			hmp->b_cont = mp->b_cont;
			freeb(mp);
		} else {
			hmp->b_cont = mp;
		}
		mp = hmp;
	}

	/* Fill in TCI, preserving old fields that were not overridden. */
	if (pri == 0)
		pri = VLAN_PRI(old_tci);
	if (vid == VLAN_ID_NONE)
		vid = VLAN_ID(old_tci);
	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
	return (mp);
}
962 
963 /*
964  * M_DATA put
965  *
966  * The poll callback function for DLS clients which are not in the per-stream
967  * mode. This function is called from an upper layer protocol (currently only
968  * tcp and udp).
969  */
970 void
971 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
972 {
973 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
974 	mblk_t *newmp;
975 	uint_t pri;
976 
977 	if (is_ethernet) {
978 		/*
979 		 * Update the priority bits to the assigned priority.
980 		 */
981 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
982 
983 		if (pri != 0) {
984 			newmp = i_dld_ether_header_update_tag(mp, pri,
985 			    VLAN_ID_NONE);
986 			if (newmp == NULL)
987 				goto discard;
988 			mp = newmp;
989 		}
990 	}
991 
992 	dld_tx_single(dsp, mp);
993 	return;
994 
995 discard:
996 	/* TODO: bump kstat? */
997 	freemsg(mp);
998 }
999 
/*
 * M_DATA put (DLIOCRAW mode).
 *
 * The client supplies the full MAC header; validate its size against the
 * SDU/LSO limits, enforce VLAN rules, then transmit.
 */
void
str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
{
	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t *bp, *newmp;
	size_t size;
	mac_header_info_t mhi;
	uint_t pri, vid;
	uint_t max_sdu;

	/*
	 * Certain MAC type plugins provide an illusion for raw DLPI
	 * consumers.  They pretend that the MAC layer is something that
	 * it's not for the benefit of observability tools.  For example,
	 * mac_wifi pretends that it's Ethernet for such consumers.
	 * Here, unless native mode is enabled, we call into the MAC layer so
	 * that this illusion can be maintained.  The plugin will optionally
	 * transform the MAC header here into something that can be passed
	 * down.  The header goes from raw mode to "cooked" mode.
	 */
	if (!dsp->ds_native) {
		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
			goto discard;
		mp = newmp;
	}

	size = MBLKL(mp);

	/*
	 * Check the packet is not too big and that any remaining
	 * fragment list is composed entirely of M_DATA messages. (We
	 * know the first fragment was M_DATA otherwise we could not
	 * have got here).
	 */
	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
		if (DB_TYPE(bp) != M_DATA)
			goto discard;
		size += MBLKL(bp);
	}

	/* Parse the client-supplied MAC header. */
	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
		goto discard;

	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
	/*
	 * If LSO is enabled, check the size against lso_max. Otherwise,
	 * compare the packet size with max_sdu.
	 */
	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
	if (size > max_sdu + mhi.mhi_hdrsize)
		goto discard;

	if (is_ethernet) {
		/*
		 * Discard the packet if this is a VLAN stream but the VID in
		 * the packet is not correct.
		 * NOTE(review): as written this drops any tagged packet
		 * (vid != VLAN_ID_NONE) on a VLAN stream — the stream's own
		 * tag is inserted below instead; confirm intent.
		 */
		vid = VLAN_ID(mhi.mhi_tci);
		if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
			goto discard;

		/*
		 * Discard the packet if this packet is a tagged packet
		 * but both pri and VID are 0.
		 */
		pri = VLAN_PRI(mhi.mhi_tci);
		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
			goto discard;

		/*
		 * Update the priority bits to the per-stream priority if
		 * priority is not set in the packet. Update the VID for
		 * packets on a VLAN stream.
		 */
		pri = (pri == 0) ? dsp->ds_pri : 0;
		if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) {
			if ((newmp = i_dld_ether_header_update_tag(mp,
			    pri, dsp->ds_vid)) == NULL) {
				goto discard;
			}
			mp = newmp;
		}
	}

	dld_tx_single(dsp, mp);
	return;

discard:
	/* TODO: bump kstat? */
	freemsg(mp);
}
1094 
/*
 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
 *
 * Bind the stream to the MAC instance identified by 'ppa': open a DLS
 * channel, cache the immutable MAC attributes (mac_info, current and
 * factory addresses, VID), register a MAC notification callback and
 * move the DLPI state machine to DL_UNBOUND.  Returns 0 or an errno
 * value; on failure any qassociate(9F) binding made here is undone.
 */
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
	dev_t				dev;
	int				err;
	const char			*drvname;
	dls_channel_t			dc;
	uint_t				addr_length;
	boolean_t			qassociated = B_FALSE;

	ASSERT(dsp->ds_dc == NULL);

	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
		return (EINVAL);

	/*
	 * /dev node access. This will still be supported for backward
	 * compatibility reason.  aggr and vnic are virtual drivers with
	 * no DDI instance per PPA, so they are not qassociate'd.
	 */
	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
	    (strcmp(drvname, "vnic") != 0)) {
		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
			return (EINVAL);
		qassociated = B_TRUE;
	}

	/*
	 * Open a channel.
	 */
	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) {
		/*
		 * style-2 VLAN open, this is a /dev VLAN ppa open
		 * which might result in a newly created dls_vlan_t.
		 */
		err = dls_open_style2_vlan(dsp->ds_major, ppa, &dc);
		if (err != 0) {
			if (qassociated)
				(void) qassociate(dsp->ds_wq, -1);
			return (err);
		}
	} else {
		dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
		if ((err = dls_open_by_dev(dev, &dc)) != 0) {
			if (qassociated)
				(void) qassociate(dsp->ds_wq, -1);
			return (err);
		}
	}

	/*
	 * Cache the MAC interface handle, a pointer to the immutable MAC
	 * information and the current and 'factory' MAC address.
	 */
	dsp->ds_mh = dls_mac(dc);
	dsp->ds_mip = mac_info(dsp->ds_mh);

	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);

	addr_length = dsp->ds_mip->mi_addr_length;
	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);

	/*
	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
	 * a non-VLAN interface).
	 */
	dsp->ds_vid = dls_vid(dc);

	/*
	 * Set the default packet priority.
	 */
	dsp->ds_pri = 0;

	/*
	 * Add a notify function so that the we get updates from the MAC.
	 */
	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);

	dsp->ds_dc = dc;
	dsp->ds_dlstate = DL_UNBOUND;

	return (0);
}
1180 
/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 *
 * Undoes dld_str_attach(): notification callback removed first (so no
 * further MAC events arrive), then capabilities disabled, transmit
 * quiesced and flushed, the DLS channel closed and the DLPI state
 * machine reset to DL_UNATTACHED.
 */
void
dld_str_detach(dld_str_t *dsp)
{
	/*
	 * Remove the notify function.
	 */
	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);

	/*
	 * Disable the capabilities and clear the promisc flag.
	 */
	ASSERT(!dsp->ds_polling);
	ASSERT(!dsp->ds_soft_ring);
	dld_capabilities_disable(dsp);
	dsp->ds_promisc = 0;

	/* Wait until any in-flight transmit completes. */
	DLD_TX_QUIESCE(dsp);

	/*
	 * Flush all pending packets which are sitting in the transmit queue.
	 */
	dld_tx_flush(dsp);

	/*
	 * Clear LSO flags.
	 */
	dsp->ds_lso = B_FALSE;
	dsp->ds_lso_max = 0;

	dls_close(dsp->ds_dc);
	dsp->ds_dc = NULL;
	dsp->ds_mh = NULL;

	if (dsp->ds_style == DL_STYLE2)
		(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;

}
1227 
/*
 * This function is only called for VLAN streams. In raw mode, we strip VLAN
 * tags before sending packets up to the DLS clients, with the exception of
 * special priority tagged packets, in that case, we set the VID to 0.
 * mp must be a VLAN tagged packet.
 *
 * Returns the (possibly reallocated) message, or NULL on allocation
 * failure, in which case the caller still owns and must free 'mp'.
 */
static mblk_t *
i_dld_ether_header_strip_tag(mblk_t *mp)
{
	mblk_t *newmp;
	struct ether_vlan_header *evhp;
	uint16_t tci, new_tci;

	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
	/*
	 * The header is modified in place, so take a private copy if the
	 * dblk is shared with other references.
	 */
	if (DB_REF(mp) > 1) {
		newmp = copymsg(mp);
		if (newmp == NULL)
			return (NULL);
		freemsg(mp);
		mp = newmp;
	}
	evhp = (struct ether_vlan_header *)mp->b_rptr;

	tci = ntohs(evhp->ether_tci);
	if (VLAN_PRI(tci) == 0) {
		/*
		 * Priority is 0, strip the tag.  Slide the destination and
		 * source addresses (2 * ETHERADDRL bytes) forward over the
		 * 4-byte tag, then advance b_rptr past the vacated space.
		 */
		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
		mp->b_rptr += VLAN_TAGSZ;
	} else {
		/*
		 * Priority is not 0, update the VID to 0.
		 */
		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
		evhp->ether_tci = htons(new_tci);
	}
	return (mp);
}
1267 
/*
 * Raw mode receive function.
 *
 * Walks the b_next chain of received packets; for each one, restores
 * the MAC header (b_rptr wind-back), optionally "uncooks" it via the
 * MAC plugin, strips the VLAN tag on VLAN streams, and passes it up
 * the read queue (dropped if the stream is flow-controlled).
 */
/*ARGSUSED*/
void
dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    mac_header_info_t *mhip)
{
	dld_str_t *dsp = (dld_str_t *)arg;
	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t *next, *newmp;

	ASSERT(mp != NULL);
	do {
		/*
		 * Get the pointer to the next packet in the chain and then
		 * clear b_next before the packet gets passed on.
		 */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * Wind back b_rptr to point at the MAC header.
		 */
		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
		mp->b_rptr -= mhip->mhi_hdrsize;

		/*
		 * Certain MAC type plugins provide an illusion for raw
		 * DLPI consumers.  They pretend that the MAC layer is
		 * something that it's not for the benefit of observability
		 * tools.  For example, mac_wifi pretends that it's Ethernet
		 * for such consumers.	Here, unless native mode is enabled,
		 * we call into the MAC layer so that this illusion can be
		 * maintained.	The plugin will optionally transform the MAC
		 * header here into something that can be passed up to raw
		 * consumers.  The header goes from "cooked" mode to raw mode.
		 */
		if (!dsp->ds_native) {
			newmp = mac_header_uncook(dsp->ds_mh, mp);
			if (newmp == NULL) {
				freemsg(mp);
				goto next;
			}
			mp = newmp;
		}

		/*
		 * Strip the VLAN tag for VLAN streams.
		 */
		if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) {
			newmp = i_dld_ether_header_strip_tag(mp);
			if (newmp == NULL) {
				freemsg(mp);
				goto next;
			}
			mp = newmp;
		}

		/*
		 * Pass the packet on.
		 */
		if (canputnext(dsp->ds_rq))
			putnext(dsp->ds_rq, mp);
		else
			freemsg(mp);

next:
		/*
		 * Move on to the next packet in the chain.
		 */
		mp = next;
	} while (mp != NULL);
}
1342 
1343 /*
1344  * Fast-path receive function.
1345  */
1346 /*ARGSUSED*/
1347 void
1348 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1349     mac_header_info_t *mhip)
1350 {
1351 	dld_str_t *dsp = (dld_str_t *)arg;
1352 	mblk_t *next;
1353 	size_t offset = 0;
1354 
1355 	/*
1356 	 * MAC header stripping rules:
1357 	 *    - Tagged packets:
1358 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1359 	 *	b. Physical streams
1360 	 *	- VLAN packets (non-zero VID). The stream must be either a
1361 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1362 	 *	  Strip the Ethernet header but keep the VLAN header.
1363 	 *	- Special tagged packets (zero VID)
1364 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1365 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1366 	 *	    keep the VLAN header.
1367 	 *	  * Otherwise, strip the whole VLAN header.
1368 	 *    - Untagged packets. Strip the whole MAC header.
1369 	 */
1370 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1371 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1372 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1373 		offset = VLAN_TAGSZ;
1374 	}
1375 
1376 	ASSERT(mp != NULL);
1377 	do {
1378 		/*
1379 		 * Get the pointer to the next packet in the chain and then
1380 		 * clear b_next before the packet gets passed on.
1381 		 */
1382 		next = mp->b_next;
1383 		mp->b_next = NULL;
1384 
1385 		/*
1386 		 * Wind back b_rptr to point at the VLAN header.
1387 		 */
1388 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1389 		mp->b_rptr -= offset;
1390 
1391 		/*
1392 		 * Pass the packet on.
1393 		 */
1394 		if (canputnext(dsp->ds_rq))
1395 			putnext(dsp->ds_rq, mp);
1396 		else
1397 			freemsg(mp);
1398 		/*
1399 		 * Move on to the next packet in the chain.
1400 		 */
1401 		mp = next;
1402 	} while (mp != NULL);
1403 }
1404 
/*
 * Default receive function (send DL_UNITDATA_IND messages).
 *
 * For each packet in the b_next chain, builds a DL_UNITDATA_IND M_PROTO
 * header describing the MAC addresses/SAP, links the payload beneath it
 * and sends the result up the read queue.
 */
/*ARGSUSED*/
void
dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    mac_header_info_t *mhip)
{
	dld_str_t		*dsp = (dld_str_t *)arg;
	mblk_t			*ud_mp;
	mblk_t			*next;
	size_t			offset = 0;
	boolean_t		strip_vlan = B_TRUE;

	/*
	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
	 */
	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
		offset = VLAN_TAGSZ;
		strip_vlan = B_FALSE;
	}

	ASSERT(mp != NULL);
	do {
		/*
		 * Get the pointer to the next packet in the chain and then
		 * clear b_next before the packet gets passed on.
		 */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * Wind back b_rptr to point at the MAC header.
		 */
		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
		mp->b_rptr -= mhip->mhi_hdrsize;

		/*
		 * Create the DL_UNITDATA_IND M_PROTO.  Note that the rest
		 * of the chain is freed too: the constructed ud_mp refers
		 * to this packet's header.
		 */
		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
			freemsgchain(mp);
			return;
		}

		/*
		 * Advance b_rptr to point at the payload (or the VLAN header).
		 */
		mp->b_rptr += (mhip->mhi_hdrsize - offset);

		/*
		 * Prepend the DL_UNITDATA_IND.
		 */
		ud_mp->b_cont = mp;

		/*
		 * Send the message.
		 */
		if (canputnext(dsp->ds_rq))
			putnext(dsp->ds_rq, ud_mp);
		else
			freemsg(ud_mp);

		/*
		 * Move on to the next packet in the chain.
		 */
		mp = next;
	} while (mp != NULL);
}
1476 
1477 /*
1478  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1479  */
1480 static void
1481 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
1482 {
1483 	mblk_t		*mp;
1484 	dl_notify_ind_t *dlip;
1485 
1486 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
1487 		return;
1488 
1489 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1490 	    M_PROTO, 0)) == NULL)
1491 		return;
1492 
1493 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1494 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1495 	dlip->dl_primitive = DL_NOTIFY_IND;
1496 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
1497 	dlip->dl_data = max_sdu;
1498 
1499 	qreply(dsp->ds_wq, mp);
1500 }
1501 
1502 /*
1503  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1504  * current state of the interface.
1505  */
1506 void
1507 dld_str_notify_ind(dld_str_t *dsp)
1508 {
1509 	mac_notify_type_t	type;
1510 
1511 	for (type = 0; type < MAC_NNOTE; type++)
1512 		str_notify(dsp, type);
1513 }
1514 
/*
 * Layout for a DL_UNITDATA_IND message: the fixed primitive followed by
 * the destination and source DLSAP addresses, each a MAC address with a
 * 16-bit SAP appended.  str_unitdata_ind() fills this in one mblk.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;	/* fixed DLPI primitive */
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1520 
/*
 * Create a DL_UNITDATA_IND M_PROTO message.
 *
 * 'mp' must point at the MAC header; its header information is parsed
 * and copied into a freshly allocated dl_unitdata_ind_wrapper_t.  When
 * 'strip_vlan' is false a tagged packet's destination SAP is reported
 * as ETHERTYPE_VLAN (the tag is being kept).  Returns the new M_PROTO
 * mblk, or NULL on parse/allocation failure (caller keeps 'mp').
 */
static mblk_t *
str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
{
	mblk_t				*nmp;
	dl_unitdata_ind_wrapper_t	*dlwp;
	dl_unitdata_ind_t		*dlp;
	mac_header_info_t		mhi;
	uint_t				addr_length;
	uint8_t				*daddr;
	uint8_t				*saddr;

	/*
	 * Get the packet header information.
	 */
	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
		return (NULL);

	/*
	 * Allocate a message large enough to contain the wrapper structure
	 * defined above.
	 */
	if ((nmp = mexchange(dsp->ds_wq, NULL,
	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
	    DL_UNITDATA_IND)) == NULL)
		return (NULL);

	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;

	dlp = &(dlwp->dl_unitdata);
	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);

	/*
	 * Copy in the destination address.  Offsets are expressed relative
	 * to the start of the primitive, per DLPI convention.
	 */
	addr_length = dsp->ds_mip->mi_addr_length;
	daddr = dlwp->dl_dest_addr;
	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
	bcopy(mhi.mhi_daddr, daddr, addr_length);

	/*
	 * Set the destination DLSAP to the SAP value encoded in the packet.
	 */
	if (mhi.mhi_istagged && !strip_vlan)
		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
	else
		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);

	/*
	 * If the destination address was multicast or broadcast then the
	 * dl_group_address field should be non-zero.
	 */
	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);

	/*
	 * Copy in the source address if one exists.  Some MAC types (DL_IB
	 * for example) may not have access to source information.
	 */
	if (mhi.mhi_saddr == NULL) {
		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
	} else {
		saddr = dlwp->dl_src_addr;
		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
		bcopy(mhi.mhi_saddr, saddr, addr_length);

		/*
		 * Set the source DLSAP to the packet ethertype.
		 */
		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
	}

	return (nmp);
}
1600 
1601 /*
1602  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1603  */
1604 static void
1605 str_notify_promisc_on_phys(dld_str_t *dsp)
1606 {
1607 	mblk_t		*mp;
1608 	dl_notify_ind_t	*dlip;
1609 
1610 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1611 		return;
1612 
1613 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1614 	    M_PROTO, 0)) == NULL)
1615 		return;
1616 
1617 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1618 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1619 	dlip->dl_primitive = DL_NOTIFY_IND;
1620 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1621 
1622 	qreply(dsp->ds_wq, mp);
1623 }
1624 
1625 /*
1626  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1627  */
1628 static void
1629 str_notify_promisc_off_phys(dld_str_t *dsp)
1630 {
1631 	mblk_t		*mp;
1632 	dl_notify_ind_t	*dlip;
1633 
1634 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1635 		return;
1636 
1637 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1638 	    M_PROTO, 0)) == NULL)
1639 		return;
1640 
1641 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1642 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1643 	dlip->dl_primitive = DL_NOTIFY_IND;
1644 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1645 
1646 	qreply(dsp->ds_wq, mp);
1647 }
1648 
/*
 * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
 *
 * Tell an interested consumer that the unicast address changed.  The
 * message carries the address followed by a 16-bit SAP, matching the
 * DLSAP address format used elsewhere on this stream.
 */
static void
str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;
	uint_t		addr_length;
	uint16_t	ethertype;

	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
		return;

	addr_length = dsp->ds_mip->mi_addr_length;
	if ((mp = mexchange(dsp->ds_wq, NULL,
	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
	dlip->dl_data = DL_CURR_PHYS_ADDR;
	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
	dlip->dl_addr_length = addr_length + sizeof (uint16_t);

	/* The DLSAP address (MAC address + SAP) follows the primitive. */
	bcopy(addr, &dlip[1], addr_length);

	/* SAPs below the 802 minimum are LLC, not an ethertype; report 0. */
	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;

	qreply(dsp->ds_wq, mp);
}
1684 
1685 /*
1686  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1687  */
1688 static void
1689 str_notify_link_up(dld_str_t *dsp)
1690 {
1691 	mblk_t		*mp;
1692 	dl_notify_ind_t	*dlip;
1693 
1694 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1695 		return;
1696 
1697 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1698 	    M_PROTO, 0)) == NULL)
1699 		return;
1700 
1701 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1702 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1703 	dlip->dl_primitive = DL_NOTIFY_IND;
1704 	dlip->dl_notification = DL_NOTE_LINK_UP;
1705 
1706 	qreply(dsp->ds_wq, mp);
1707 }
1708 
1709 /*
1710  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1711  */
1712 static void
1713 str_notify_link_down(dld_str_t *dsp)
1714 {
1715 	mblk_t		*mp;
1716 	dl_notify_ind_t	*dlip;
1717 
1718 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1719 		return;
1720 
1721 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1722 	    M_PROTO, 0)) == NULL)
1723 		return;
1724 
1725 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1726 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1727 	dlip->dl_primitive = DL_NOTIFY_IND;
1728 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1729 
1730 	qreply(dsp->ds_wq, mp);
1731 }
1732 
1733 /*
1734  * DL_NOTIFY_IND: DL_NOTE_SPEED
1735  */
1736 static void
1737 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1738 {
1739 	mblk_t		*mp;
1740 	dl_notify_ind_t	*dlip;
1741 
1742 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1743 		return;
1744 
1745 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1746 	    M_PROTO, 0)) == NULL)
1747 		return;
1748 
1749 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1750 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1751 	dlip->dl_primitive = DL_NOTIFY_IND;
1752 	dlip->dl_notification = DL_NOTE_SPEED;
1753 	dlip->dl_data = speed;
1754 
1755 	qreply(dsp->ds_wq, mp);
1756 }
1757 
1758 /*
1759  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1760  */
1761 static void
1762 str_notify_capab_reneg(dld_str_t *dsp)
1763 {
1764 	mblk_t		*mp;
1765 	dl_notify_ind_t	*dlip;
1766 
1767 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1768 		return;
1769 
1770 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1771 	    M_PROTO, 0)) == NULL)
1772 		return;
1773 
1774 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1775 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1776 	dlip->dl_primitive = DL_NOTIFY_IND;
1777 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1778 
1779 	qreply(dsp->ds_wq, mp);
1780 }
1781 
1782 /*
1783  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1784  */
1785 static void
1786 str_notify_fastpath_flush(dld_str_t *dsp)
1787 {
1788 	mblk_t		*mp;
1789 	dl_notify_ind_t	*dlip;
1790 
1791 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1792 		return;
1793 
1794 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1795 	    M_PROTO, 0)) == NULL)
1796 		return;
1797 
1798 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1799 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1800 	dlip->dl_primitive = DL_NOTIFY_IND;
1801 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1802 
1803 	qreply(dsp->ds_wq, mp);
1804 }
1805 
/*
 * MAC notification callback.
 *
 * Registered via mac_notify_add() in dld_str_attach(); translates each
 * MAC-layer event into the corresponding action on this stream (queue
 * enable, cached-state refresh, and/or a DL_NOTIFY_IND upstream).
 */
static void
str_notify(void *arg, mac_notify_type_t type)
{
	dld_str_t		*dsp = (dld_str_t *)arg;
	queue_t			*q = dsp->ds_wq;

	switch (type) {
	case MAC_NOTE_TX:
		/* Transmit resources became available; restart the wsrv. */
		qenable(q);
		break;

	case MAC_NOTE_DEVPROMISC:
		/*
		 * Send the appropriate DL_NOTIFY_IND.
		 */
		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
			str_notify_promisc_on_phys(dsp);
		else
			str_notify_promisc_off_phys(dsp);
		break;

	case MAC_NOTE_PROMISC:
		break;

	case MAC_NOTE_UNICST:
		/*
		 * This notification is sent whenever the MAC unicast address
		 * changes. We need to re-cache the address.
		 */
		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);

		/*
		 * Send the appropriate DL_NOTIFY_IND.
		 */
		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
		break;

	case MAC_NOTE_LINK:
		/*
		 * This notification is sent every time the MAC driver
		 * updates the link state.
		 */
		switch (mac_link_get(dsp->ds_mh)) {
		case LINK_STATE_UP: {
			uint64_t speed;
			/*
			 * The link is up so send the appropriate
			 * DL_NOTIFY_IND.
			 */
			str_notify_link_up(dsp);

			/* MAC_STAT_IFSPEED is in bits/s; report kbits/s. */
			speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
			break;
		}
		case LINK_STATE_DOWN:
			/*
			 * The link is down so send the appropriate
			 * DL_NOTIFY_IND.
			 */
			str_notify_link_down(dsp);
			break;

		default:
			break;
		}
		break;

	case MAC_NOTE_RESOURCE:
	case MAC_NOTE_VNIC:
		/*
		 * This notification is sent whenever the MAC resources
		 * change or capabilities change. We need to renegotiate
		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
		 */
		str_notify_capab_reneg(dsp);
		break;

	case MAC_NOTE_SDU_SIZE: {
		uint_t  max_sdu;
		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
		str_notify_sdu_size(dsp, max_sdu);
		break;
	}

	case MAC_NOTE_FASTPATH_FLUSH:
		str_notify_fastpath_flush(dsp);
		break;

	case MAC_NOTE_MARGIN:
		break;

	default:
		ASSERT(B_FALSE);
		break;
	}
}
1906 
1907 static inline uint_t
1908 mp_getsize(mblk_t *mp)
1909 {
1910 	ASSERT(DB_TYPE(mp) == M_DATA);
1911 	return ((mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp));
1912 }
1913 
/*
 * Calculate the dld queue depth, free the messages that exceed the threshold.
 *
 * timeout(9F) callback armed by dld_tx_enqueue() when messages were
 * queued without known sizes.  Walks the transmit list under
 * ds_tx_list_lock, recomputing ds_tx_cnt/ds_tx_msgcnt, and truncates
 * the list at the first point the thresholds are exceeded.
 */
static void
dld_tx_qdepth_timer(void *arg)
{
	dld_str_t *dsp = (dld_str_t *)arg;
	mblk_t *prev, *mp;
	uint_t cnt, msgcnt, size;

	mutex_enter(&dsp->ds_tx_list_lock);

	/* Calculate total size and count of the packet(s) */
	cnt = msgcnt = 0;
	for (prev = NULL, mp = dsp->ds_tx_list_head; mp != NULL;
	    prev = mp, mp = mp->b_next) {
		size = mp_getsize(mp);
		cnt += size;
		msgcnt++;
		/*
		 * NOTE(review): both the byte count and the message count
		 * are compared against dld_max_q_count here, mirroring the
		 * check in dld_tx_enqueue() — confirm a separate message
		 * count limit was not intended.
		 */
		if (cnt >= dld_max_q_count || msgcnt >= dld_max_q_count) {
			ASSERT(dsp->ds_tx_qbusy);
			/* Truncate: everything from 'mp' onward is dropped. */
			dsp->ds_tx_list_tail = prev;
			if (prev == NULL)
				dsp->ds_tx_list_head = NULL;
			else
				prev->b_next = NULL;
			freemsgchain(mp);
			cnt -= size;
			msgcnt--;
			break;
		}
	}
	dsp->ds_tx_cnt = cnt;
	dsp->ds_tx_msgcnt = msgcnt;
	dsp->ds_tx_qdepth_tid = 0;
	mutex_exit(&dsp->ds_tx_list_lock);
}
1951 
/*
 * Enqueue one or more messages on the transmit queue. Caller specifies:
 *  - the insertion position (head/tail).
 *  - the message count and the total message size of messages to be queued
 *    if they are known to the caller; or 0 if they are not known.
 *
 * If the caller does not know the message size information, this usually
 * means that dld_wsrv() managed to send some but not all of the queued
 * messages. For performance reasons, we do not calculate the queue depth
 * every time. Instead, a timer is started to calculate the queue depth
 * every 1 second (can be changed by tx_qdepth_interval).
 *
 * 'mp' is a b_next chain ending at 'tail'.  On tail insertion with a
 * full queue the new chain is dropped; head insertion (requeue from
 * dld_wsrv()) is never dropped.
 */
static void
dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, mblk_t *tail, boolean_t head_insert,
    uint_t msgcnt, uint_t cnt)
{
	queue_t *q = dsp->ds_wq;
	uint_t tot_cnt, tot_msgcnt;
	mblk_t *next;

	mutex_enter(&dsp->ds_tx_list_lock);

	/*
	 * Simply enqueue the message and calculate the queue depth via
	 * timer if:
	 *
	 * - the current queue depth is incorrect, and the timer is already
	 *   started; or
	 *
	 * - the given message size is unknown and it is allowed to start the
	 *   timer;
	 */
	if ((dsp->ds_tx_qdepth_tid != 0) ||
	    (msgcnt == 0 && tx_qdepth_interval != 0)) {
		goto enqueue;
	}

	/*
	 * The timer is not allowed, so calculate the message size now.
	 */
	if (msgcnt == 0) {
		for (next = mp; next != NULL; next = next->b_next) {
			cnt += mp_getsize(next);
			msgcnt++;
		}
	}

	/*
	 * Grow the queue depth using the input messesge size.
	 *
	 * If the queue depth would exceed the allowed threshold, drop
	 * new packet(s) and drain those already in the queue.
	 */
	tot_cnt = dsp->ds_tx_cnt + cnt;
	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;

	if (!head_insert && (tot_cnt >= dld_max_q_count ||
	    tot_msgcnt >= dld_max_q_count)) {
		ASSERT(dsp->ds_tx_qbusy);
		mutex_exit(&dsp->ds_tx_list_lock);
		freemsgchain(mp);
		goto done;
	}
	/* Update the queue size parameters */
	dsp->ds_tx_cnt = tot_cnt;
	dsp->ds_tx_msgcnt = tot_msgcnt;

enqueue:
	/*
	 * If the transmit queue is currently empty and we are
	 * about to deposit the packet(s) there, switch mode to
	 * "busy" and raise flow-control condition.  The flow-control
	 * mblk parked on the write queue keeps upstream modules from
	 * sending more data until dld_tx_flush()/dld_wsrv() clears it.
	 */
	if (!dsp->ds_tx_qbusy) {
		dsp->ds_tx_qbusy = B_TRUE;
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		(void) putq(q, dsp->ds_tx_flow_mp);
		dsp->ds_tx_flow_mp = NULL;
	}

	if (!head_insert) {
		/* Tail insertion */
		if (dsp->ds_tx_list_head == NULL)
			dsp->ds_tx_list_head = mp;
		else
			dsp->ds_tx_list_tail->b_next = mp;
		dsp->ds_tx_list_tail = tail;
	} else {
		/* Head insertion */
		tail->b_next = dsp->ds_tx_list_head;
		if (dsp->ds_tx_list_head == NULL)
			dsp->ds_tx_list_tail = tail;
		dsp->ds_tx_list_head = mp;
	}

	if (msgcnt == 0 && dsp->ds_tx_qdepth_tid == 0 &&
	    tx_qdepth_interval != 0) {
		/*
		 * The message size is not given so that we need to start
		 * the timer to calculate the queue depth.
		 */
		dsp->ds_tx_qdepth_tid = timeout(dld_tx_qdepth_timer, dsp,
		    drv_usectohz(tx_qdepth_interval));
		ASSERT(dsp->ds_tx_qdepth_tid != NULL);
	}
	mutex_exit(&dsp->ds_tx_list_lock);
done:
	/* Schedule service thread to drain the transmit queue */
	if (!head_insert)
		qenable(q);
}
2063 
/*
 * Discard every message on the transmit queue, clear the flow-control
 * condition (reclaiming the flow-control mblk from the write queue),
 * and cancel any pending queue-depth timer.
 */
void
dld_tx_flush(dld_str_t *dsp)
{
	timeout_id_t	tid = 0;

	mutex_enter(&dsp->ds_tx_list_lock);
	if (dsp->ds_tx_list_head != NULL) {
		freemsgchain(dsp->ds_tx_list_head);
		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
		if (dsp->ds_tx_qbusy) {
			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
			ASSERT(dsp->ds_tx_flow_mp != NULL);
			dsp->ds_tx_qbusy = B_FALSE;
		}
		if ((tid = dsp->ds_tx_qdepth_tid) != 0)
			dsp->ds_tx_qdepth_tid = 0;
	}
	mutex_exit(&dsp->ds_tx_list_lock);

	/*
	 * Note that ds_tx_list_lock (which is acquired by the timeout
	 * callback routine) cannot be held across the call to untimeout().
	 */
	if (tid != 0)
		(void) untimeout(tid);
}
2091 
/*
 * Process a non-data message.
 *
 * Non-data messages may block, so they are appended to the per-stream
 * pending list and handled asynchronously by dld_wput_nondata_task()
 * on dld_disp_taskq.  At most one task is outstanding per stream
 * (ds_tid); the task re-dispatches itself while messages remain.
 */
static void
dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
{
	ASSERT((dsp->ds_type == DLD_DLPI && dsp->ds_ioctl == NULL) ||
	    (dsp->ds_type == DLD_CONTROL && dsp->ds_ioctl != NULL));

	mutex_enter(&dsp->ds_disp_lock);

	/*
	 * The processing of the message might block. Enqueue the
	 * message for later processing.
	 */
	if (dsp->ds_pending_head == NULL) {
		dsp->ds_pending_head = dsp->ds_pending_tail = mp;
	} else {
		dsp->ds_pending_tail->b_next = mp;
		dsp->ds_pending_tail = mp;
	}

	/*
	 * If there is no task pending, kick off the task.
	 */
	if (dsp->ds_tid == NULL) {
		dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
		    dld_wput_nondata_task, dsp, TQ_SLEEP);
		ASSERT(dsp->ds_tid != NULL);
	}
	mutex_exit(&dsp->ds_disp_lock);
}
2124 
/*
 * The worker thread which processes non-data messages. Note we only process
 * one message at one time in order to be able to "flush" the queued message
 * and serialize the processing.
 *
 * Runs on dld_disp_taskq.  Dequeues one message (lock held only for
 * the list manipulation), dispatches it by message type, then either
 * re-dispatches itself if more work is pending or clears ds_tid.  If
 * the stream is closing, all pending messages are discarded and the
 * closer is signalled via ds_disp_cv.
 */
static void
dld_wput_nondata_task(void *arg)
{
	dld_str_t	*dsp = (dld_str_t *)arg;
	mblk_t		*mp;

	mutex_enter(&dsp->ds_disp_lock);
	ASSERT(dsp->ds_pending_head != NULL);
	ASSERT(dsp->ds_tid != NULL);

	if (dsp->ds_closing)
		goto closing;

	mp = dsp->ds_pending_head;
	if ((dsp->ds_pending_head = mp->b_next) == NULL)
		dsp->ds_pending_tail = NULL;
	mp->b_next = NULL;

	mutex_exit(&dsp->ds_disp_lock);

	/*
	 * NOTE(review): this switch has no default arm, so a message of an
	 * unexpected DB_TYPE would fall through without being freed —
	 * confirm callers only ever queue the types handled below.
	 */
	switch (DB_TYPE(mp)) {
	case M_PROTO:
	case M_PCPROTO:
		ASSERT(dsp->ds_type == DLD_DLPI);
		dld_wput_proto_nondata(dsp, mp);
		break;
	case M_IOCTL: {
		uint_t cmd;

		/* Control streams have their own ioctl handler. */
		if (dsp->ds_type == DLD_CONTROL) {
			ASSERT(dsp->ds_ioctl != NULL);
			dsp->ds_ioctl(dsp->ds_wq, mp);
			break;
		}

		cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;

		switch (cmd) {
		case DLIOCNATIVE:
			ioc_native(dsp, mp);
			break;
		case DLIOCMARGININFO:
			ioc_margin(dsp, mp);
			break;
		case DLIOCRAW:
			ioc_raw(dsp, mp);
			break;
		case DLIOCHDRINFO:
			ioc_fast(dsp, mp);
			break;
		default:
			ioc(dsp, mp);
			break;
		}
		break;
	}
	case M_IOCDATA:
		ASSERT(dsp->ds_type == DLD_DLPI);
		ioc(dsp, mp);
		break;
	}

	mutex_enter(&dsp->ds_disp_lock);

	if (dsp->ds_closing)
		goto closing;

	if (dsp->ds_pending_head != NULL) {
		dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
		    dld_wput_nondata_task, dsp, TQ_SLEEP);
		ASSERT(dsp->ds_tid != NULL);
	} else {
		dsp->ds_tid = NULL;
	}
	mutex_exit(&dsp->ds_disp_lock);
	return;

	/*
	 * If the stream is closing, flush all queued messages and inform
	 * the stream once it is done.
	 */
closing:
	freemsgchain(dsp->ds_pending_head);
	dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
	dsp->ds_tid = NULL;
	cv_signal(&dsp->ds_disp_cv);
	mutex_exit(&dsp->ds_disp_lock);
}
2218 
2219 /*
2220  * Flush queued non-data messages.
2221  */
2222 static void
2223 dld_flush_nondata(dld_str_t *dsp)
2224 {
2225 	mutex_enter(&dsp->ds_disp_lock);
2226 	freemsgchain(dsp->ds_pending_head);
2227 	dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
2228 	mutex_exit(&dsp->ds_disp_lock);
2229 }
2230 
2231 /*
2232  * DLIOCNATIVE
2233  */
2234 static void
2235 ioc_native(dld_str_t *dsp, mblk_t *mp)
2236 {
2237 	queue_t *q = dsp->ds_wq;
2238 	const mac_info_t *mip = dsp->ds_mip;
2239 
2240 	rw_enter(&dsp->ds_lock, RW_WRITER);
2241 
2242 	/*
2243 	 * Native mode can be enabled if it's disabled and if the
2244 	 * native media type is different.
2245 	 */
2246 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2247 		dsp->ds_native = B_TRUE;
2248 
2249 	rw_exit(&dsp->ds_lock);
2250 
2251 	if (dsp->ds_native)
2252 		miocack(q, mp, 0, mip->mi_nativemedia);
2253 	else
2254 		miocnak(q, mp, 0, ENOTSUP);
2255 }
2256 
2257 /*
2258  * DLIOCMARGININFO
2259  */
2260 static void
2261 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2262 {
2263 	queue_t *q = dsp->ds_wq;
2264 	uint32_t margin;
2265 	int err;
2266 
2267 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2268 		err = EINVAL;
2269 		goto failed;
2270 	}
2271 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2272 		goto failed;
2273 
2274 	mac_margin_get(dsp->ds_mh, &margin);
2275 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2276 	miocack(q, mp, sizeof (uint32_t), 0);
2277 	return;
2278 
2279 failed:
2280 	miocnak(q, mp, 0, err);
2281 }
2282 
2283 /*
2284  * DLIOCRAW
2285  */
2286 static void
2287 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2288 {
2289 	queue_t *q = dsp->ds_wq;
2290 
2291 	if (dsp->ds_polling || dsp->ds_soft_ring) {
2292 		miocnak(q, mp, 0, EPROTO);
2293 		return;
2294 	}
2295 
2296 	rw_enter(&dsp->ds_lock, RW_WRITER);
2297 	if ((dsp->ds_mode != DLD_RAW) && (dsp->ds_dlstate == DL_IDLE)) {
2298 		/*
2299 		 * Set the receive callback.
2300 		 */
2301 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, dsp);
2302 		dsp->ds_tx = str_mdata_raw_put;
2303 	}
2304 	dsp->ds_mode = DLD_RAW;
2305 	rw_exit(&dsp->ds_lock);
2306 	miocack(q, mp, 0, 0);
2307 }
2308 
2309 /*
2310  * DLIOCHDRINFO
2311  */
2312 static void
2313 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2314 {
2315 	dl_unitdata_req_t *dlp;
2316 	off_t		off;
2317 	size_t		len;
2318 	const uint8_t	*addr;
2319 	uint16_t	sap;
2320 	mblk_t		*nmp;
2321 	mblk_t		*hmp;
2322 	uint_t		addr_length;
2323 	queue_t		*q = dsp->ds_wq;
2324 	int		err;
2325 
2326 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2327 		err = ENOTSUP;
2328 		goto failed;
2329 	}
2330 
2331 	/*
2332 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2333 	 * user-land should not be allowed.
2334 	 */
2335 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2336 		err = EINVAL;
2337 		goto failed;
2338 	}
2339 
2340 	nmp = mp->b_cont;
2341 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2342 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2343 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2344 		err = EINVAL;
2345 		goto failed;
2346 	}
2347 
2348 	off = dlp->dl_dest_addr_offset;
2349 	len = dlp->dl_dest_addr_length;
2350 
2351 	if (!MBLKIN(nmp, off, len)) {
2352 		err = EINVAL;
2353 		goto failed;
2354 	}
2355 
2356 	/*
2357 	 * We don't need to hold any locks to access ds_dlstate, because
2358 	 * control message prossessing (which updates this field) is
2359 	 * serialized.
2360 	 */
2361 	if (dsp->ds_dlstate != DL_IDLE) {
2362 		err = ENOTSUP;
2363 		goto failed;
2364 	}
2365 
2366 	addr_length = dsp->ds_mip->mi_addr_length;
2367 	if (len != addr_length + sizeof (uint16_t)) {
2368 		err = EINVAL;
2369 		goto failed;
2370 	}
2371 
2372 	addr = nmp->b_rptr + off;
2373 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2374 
2375 	if ((hmp = dls_header(dsp->ds_dc, addr, sap, 0, NULL)) == NULL) {
2376 		err = ENOMEM;
2377 		goto failed;
2378 	}
2379 
2380 	rw_enter(&dsp->ds_lock, RW_WRITER);
2381 	ASSERT(dsp->ds_dlstate == DL_IDLE);
2382 	if (dsp->ds_mode != DLD_FASTPATH) {
2383 		/*
2384 		 * Set the receive callback (unless polling or
2385 		 * soft-ring is enabled).
2386 		 */
2387 		dsp->ds_mode = DLD_FASTPATH;
2388 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
2389 			dls_rx_set(dsp->ds_dc, dld_str_rx_fastpath, dsp);
2390 		dsp->ds_tx = str_mdata_fastpath_put;
2391 	}
2392 	rw_exit(&dsp->ds_lock);
2393 
2394 	freemsg(nmp->b_cont);
2395 	nmp->b_cont = hmp;
2396 
2397 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2398 	return;
2399 failed:
2400 	miocnak(q, mp, 0, err);
2401 }
2402 
2403 static void
2404 ioc(dld_str_t *dsp, mblk_t *mp)
2405 {
2406 	queue_t	*q = dsp->ds_wq;
2407 	mac_handle_t mh;
2408 
2409 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2410 		miocnak(q, mp, 0, EINVAL);
2411 		return;
2412 	}
2413 	mh = dsp->ds_mh;
2414 	ASSERT(mh != NULL);
2415 	mac_ioctl(mh, q, mp);
2416 }
2417