xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision 19397407)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Data-Link Driver
28  */
29 
30 #include	<sys/stropts.h>
31 #include	<sys/strsun.h>
32 #include	<sys/strsubr.h>
33 #include	<sys/atomic.h>
34 #include	<sys/disp.h>
35 #include	<sys/callb.h>
36 #include	<sys/vlan.h>
37 #include	<sys/dld.h>
38 #include	<sys/dld_impl.h>
39 #include	<sys/dls_impl.h>
40 #include	<inet/common.h>
41 
/*
 * Forward declarations for the file-local (static) routines, followed by
 * the module-wide state shared by the dld_str_*() functions.
 */
42 static int	str_constructor(void *, void *, int);
43 static void	str_destructor(void *, void *);
44 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
45 static void	str_notify_promisc_on_phys(dld_str_t *);
46 static void	str_notify_promisc_off_phys(dld_str_t *);
47 static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
48 static void	str_notify_link_up(dld_str_t *);
49 static void	str_notify_link_down(dld_str_t *);
50 static void	str_notify_capab_reneg(dld_str_t *);
51 static void	str_notify_speed(dld_str_t *, uint32_t);
52 static void	str_notify(void *, mac_notify_type_t);
53 
54 static void	ioc_native(dld_str_t *,  mblk_t *);
55 static void	ioc_margin(dld_str_t *, mblk_t *);
56 static void	ioc_raw(dld_str_t *, mblk_t *);
57 static void	ioc_fast(dld_str_t *,  mblk_t *);
58 static void	ioc(dld_str_t *, mblk_t *);
59 static void	dld_tx_enqueue(dld_str_t *, mblk_t *, mblk_t *, boolean_t,
60 		    uint_t, uint_t);
61 static void	dld_wput_nondata(dld_str_t *, mblk_t *);
62 static void	dld_wput_nondata_task(void *);
63 static void	dld_flush_nondata(dld_str_t *);
64 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t);
65 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *);
66 
67 static uint32_t		str_count;	/* dld_str_t objects currently allocated */
68 static kmem_cache_t	*str_cachep;	/* object cache of dld_str_t */
69 static taskq_t		*dld_disp_taskq = NULL;	/* taskq for DLPI requests */
70 static mod_hash_t	*str_hashp;	/* dld_str_t's keyed by ds_minor */
71 
72 #define	STR_HASHSZ		64	/* size passed when creating str_hashp */
73 #define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
74 
75 static inline uint_t	mp_getsize(mblk_t *);
76 
77 /*
78  * Interval to count the TX queued depth. Default is 1s (1000000us).
79  * Count the queue depth immediately (not by timeout) if this is set to 0.
80  * See more details above dld_tx_enqueue().
81  */
82 uint_t tx_qdepth_interval = 1000000;
83 
84 /*
85  * Some notes on entry points, flow-control, queueing and locking:
86  *
87  * This driver exports the traditional STREAMS put entry point as well as
88  * the non-STREAMS fast-path transmit routine which is provided to IP via
89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
90  * and data operations, while the fast-path routine deals only with M_DATA
91  * fast-path packets.  Regardless of the entry point, all outbound packets
92  * will end up in dld_tx_single(), where they will be delivered to the MAC
93  * driver.
94  *
95  * The transmit logic operates in two modes: a "not busy" mode where the
96  * packets will be delivered to the MAC for a send attempt, or "busy" mode
97  * where they will be enqueued in the internal queue because of flow-control.
98  * Flow-control happens when the MAC driver indicates the packets couldn't
99  * be transmitted due to lack of resources (e.g. running out of descriptors).
100  * In such case, the driver will place a dummy message on its write-side
101  * STREAMS queue so that the queue is marked as "full".  Any subsequent
102  * packets arriving at the driver will be enqueued in the internal queue,
103  * which is drained in the context of the service thread that gets scheduled
104  * whenever the driver is in the "busy" mode.  When all packets have been
105  * successfully delivered by MAC and the internal queue is empty, it will
106  * transition to the "not busy" mode by removing the dummy message from the
107  * write-side STREAMS queue; in effect this will trigger backenabling.
108  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
109  * to the above reasons.
110  *
111  * The driver implements an internal transmit queue independent of STREAMS.
112  * This allows for flexibility and provides a fast enqueue/dequeue mechanism
113  * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
114  * getq() operations done by the driver are those related to placing and
115  * removing the dummy message to/from the write-side STREAMS queue for flow-
116  * control purposes.
117  *
118  * Locking is done independent of STREAMS due to the driver being fully MT.
119  * Threads entering the driver (either from put or service entry points)
120  * will most likely be readers, with the exception of a few writer cases
121  * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
122  * DLD-related ioctl requests.  The DLPI detach case is special, because
123  * it involves freeing resources and therefore must be single-threaded.
124  * Unfortunately the readers/writers lock can't be used to protect against
125  * it, because the lock is dropped prior to the driver calling places where
126  * putnext() may be invoked, and such places may depend on those resources
127  * to exist.  Because of this, the driver always completes the DLPI detach
128  * process when there are no other threads running in the driver.  This is
129  * done by keeping track of the number of threads, such that the last
130  * thread leaving the driver will finish the pending DLPI detach operation.
131  */
132 
133 /*
134  * dld_max_q_count is the queue depth threshold used to limit the number of
135  * outstanding packets or bytes allowed in the queue; once this limit is
136  * reached the driver will free any incoming ones until the queue depth
137  * drops below the threshold.
138  *
139  * This buffering is provided to accommodate clients which do not employ
140  * their own buffering scheme, and to handle occasional packet bursts.
141  * Clients which handle their own buffering will receive positive feedback
142  * from this driver as soon as it transitions into the "busy" state, i.e.
143  * when the queue is initially filled up; they will get backenabled once
144  * the queue is empty.
145  *
146  * The value chosen here is rather arbitrary; in future some intelligent
147  * heuristics may be involved which could take into account the hardware's
148  * transmit ring size, etc.
149  */
150 uint_t dld_max_q_count = (16 * 1024 *1024);
151 
152 /*
153  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
154  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
155  * match dev_t. If a stream is found and it is attached, its dev_info_t *
156  * is returned.
157  */
/*
 * Walker state handed by dld_finddevinfo() to i_dld_str_walker(): the
 * major/minor pair being looked up and the dev_info_t found for it.
 */
158 typedef struct i_dld_str_state_s {
159 	major_t		ds_major;	/* in: major number to match */
160 	minor_t		ds_minor;	/* in: minor number to match (non-zero) */
161 	dev_info_t	*ds_dip;	/* out: devinfo of the match, else NULL */
162 } i_dld_str_state_t;
163 
164 /* ARGSUSED */
165 static uint_t
166 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
167 {
168 	i_dld_str_state_t	*statep = arg;
169 	dld_str_t		*dsp = (dld_str_t *)val;
170 
171 	if (statep->ds_major != dsp->ds_major)
172 		return (MH_WALK_CONTINUE);
173 
174 	ASSERT(statep->ds_minor != 0);
175 
176 	/*
177 	 * Access to ds_mh needs to be protected by ds_lock.
178 	 */
179 	rw_enter(&dsp->ds_lock, RW_READER);
180 	if (statep->ds_minor == dsp->ds_minor) {
181 		/*
182 		 * Clone: a clone minor is unique. we can terminate the
183 		 * walk if we find a matching stream -- even if we fail
184 		 * to obtain the devinfo.
185 		 */
186 		if (dsp->ds_mh != NULL)
187 			statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
188 		rw_exit(&dsp->ds_lock);
189 		return (MH_WALK_TERMINATE);
190 	}
191 	rw_exit(&dsp->ds_lock);
192 	return (MH_WALK_CONTINUE);
193 }
194 
195 static dev_info_t *
196 dld_finddevinfo(dev_t dev)
197 {
198 	dev_info_t	*dip;
199 	i_dld_str_state_t	state;
200 
201 	if (getminor(dev) == 0)
202 		return (NULL);
203 
204 	/*
205 	 * See if it's a minor node of a link
206 	 */
207 	if ((dip = dls_finddevinfo(dev)) != NULL)
208 		return (dip);
209 
210 	state.ds_minor = getminor(dev);
211 	state.ds_major = getmajor(dev);
212 	state.ds_dip = NULL;
213 
214 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
215 	return (state.ds_dip);
216 }
217 
218 /*
219  * devo_getinfo: getinfo(9e)
220  */
221 /*ARGSUSED*/
222 int
223 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
224 {
225 	dev_info_t	*devinfo;
226 	minor_t		minor = getminor((dev_t)arg);
227 	int		rc = DDI_FAILURE;
228 
229 	switch (cmd) {
230 	case DDI_INFO_DEVT2DEVINFO:
231 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
232 			*(dev_info_t **)resp = devinfo;
233 			rc = DDI_SUCCESS;
234 		}
235 		break;
236 	case DDI_INFO_DEVT2INSTANCE:
237 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
238 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
239 			rc = DDI_SUCCESS;
240 		} else if (minor > DLS_MAX_MINOR &&
241 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
242 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
243 			rc = DDI_SUCCESS;
244 		}
245 		break;
246 	}
247 	return (rc);
248 }
249 
250 /*
251  * qi_qopen: open(9e)
252  */
253 /*ARGSUSED*/
254 int
255 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
256 {
257 	dld_str_t	*dsp;
258 	major_t		major;
259 	minor_t		minor;
260 	int		err;
261 
262 	if (sflag == MODOPEN)
263 		return (ENOTSUP);
264 
265 	/*
266 	 * This is a cloning driver and therefore each queue should only
267 	 * ever get opened once.
268 	 */
269 	if (rq->q_ptr != NULL)
270 		return (EBUSY);
271 
272 	major = getmajor(*devp);
273 	minor = getminor(*devp);
274 
275 	/*
276 	 * Create a new dld_str_t for the stream. This will grab a new minor
277 	 * number that will be handed back in the cloned dev_t.  Creation may
278 	 * fail if we can't allocate the dummy mblk used for flow-control.
279 	 */
280 	dsp = dld_str_create(rq, DLD_DLPI, major,
281 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
282 	if (dsp == NULL)
283 		return (ENOSR);
284 
285 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
286 	if (minor != 0) {
287 		/*
288 		 * Style 1 open
289 		 */
290 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
291 			goto failed;
292 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
293 	} else {
294 		(void) qassociate(rq, -1);
295 	}
296 
297 	/*
298 	 * Enable the queue srv(9e) routine.
299 	 */
300 	qprocson(rq);
301 
302 	/*
303 	 * Construct a cloned dev_t to hand back.
304 	 */
305 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
306 	return (0);
307 
308 failed:
309 	dld_str_destroy(dsp);
310 	return (err);
311 }
312 
313 /*
314  * qi_qclose: close(9e)
315  */
316 int
317 dld_close(queue_t *rq)
318 {
319 	dld_str_t	*dsp = rq->q_ptr;
320 
321 	/*
322 	 * Disable the queue srv(9e) routine.
323 	 */
324 	qprocsoff(rq);
325 
326 	dld_finish_pending_task(dsp);
327 
328 	/*
329 	 * This stream was open to a provider node. Check to see
330 	 * if it has been cleanly shut down.
331 	 */
332 	if (dsp->ds_dlstate != DL_UNATTACHED) {
333 		/*
334 		 * The stream is either open to a style 1 provider or
335 		 * this is not clean shutdown. Detach from the PPA.
336 		 * (This is still ok even in the style 1 case).
337 		 */
338 		dld_str_detach(dsp);
339 	}
340 
341 	dld_str_destroy(dsp);
342 	return (0);
343 }
344 
345 /*
346  * qi_qputp: put(9e)
347  */
348 void
349 dld_wput(queue_t *wq, mblk_t *mp)
350 {
351 	dld_str_t	*dsp = wq->q_ptr;
352 
353 	switch (DB_TYPE(mp)) {
354 	case M_DATA: {
355 		dld_tx_t tx;
356 
357 		DLD_TX_ENTER(dsp);
358 		if ((tx = dsp->ds_tx) != NULL)
359 			tx(dsp, mp);
360 		else
361 			freemsg(mp);
362 		DLD_TX_EXIT(dsp);
363 		break;
364 	}
365 	case M_PROTO:
366 	case M_PCPROTO: {
367 		t_uscalar_t	prim;
368 		dld_tx_t	tx;
369 
370 		if (MBLKL(mp) < sizeof (t_uscalar_t)) {
371 			freemsg(mp);
372 			return;
373 		}
374 
375 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
376 		if (prim != DL_UNITDATA_REQ) {
377 			/* Control path */
378 			dld_wput_nondata(dsp, mp);
379 			break;
380 		}
381 
382 		/* Data path */
383 		DLD_TX_ENTER(dsp);
384 		if ((tx = dsp->ds_unitdata_tx) != NULL)
385 			tx(dsp, mp);
386 		else
387 			dlerrorack(wq, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0);
388 		DLD_TX_EXIT(dsp);
389 		break;
390 	}
391 	case M_IOCTL:
392 	case M_IOCDATA:
393 		/* Control path */
394 		dld_wput_nondata(dsp, mp);
395 		break;
396 	case M_FLUSH:
397 		/*
398 		 * Flush both the data messages and the control messages.
399 		 */
400 		if (*mp->b_rptr & FLUSHW) {
401 			dld_flush_nondata(dsp);
402 			dld_tx_flush(dsp);
403 			*mp->b_rptr &= ~FLUSHW;
404 		}
405 
406 		if (*mp->b_rptr & FLUSHR) {
407 			qreply(wq, mp);
408 		} else {
409 			freemsg(mp);
410 		}
411 		break;
412 	default:
413 		freemsg(mp);
414 		break;
415 	}
416 }
417 
418 /*
419  * Called by GLDv3 control node to process the ioctls. It will start
420  * a taskq to allow the ioctl processing to block. This is a temporary
421  * solution, and will be replaced by a more graceful approach afterwards.
422  */
423 void
424 dld_ioctl(queue_t *wq, mblk_t *mp)
425 {
426 	dld_wput_nondata(wq->q_ptr, mp);
427 }
428 
429 /*
430  * qi_srvp: srv(9e)
431  */
432 void
433 dld_wsrv(queue_t *wq)
434 {
435 	mblk_t		*mp, *head, *tail;
436 	dld_str_t	*dsp = wq->q_ptr;
437 	uint_t		cnt, msgcnt;
438 	timeout_id_t	tid = 0;
439 
440 	rw_enter(&dsp->ds_lock, RW_READER);
441 	/*
442 	 * Grab all packets (chained via b_next) off our transmit queue
443 	 * and try to send them all to the MAC layer.  Since the queue
444 	 * is independent of streams, we are able to dequeue all messages
445 	 * at once without looping through getq() and manually chaining
446 	 * them.  Note that the queue size parameters (byte and message
447 	 * counts) are cleared as well, but we postpone the backenabling
448 	 * until after the MAC transmit since some packets may end up
449 	 * back at our transmit queue.
450 	 */
451 	mutex_enter(&dsp->ds_tx_list_lock);
452 	if ((mp = dsp->ds_tx_list_head) == NULL) {
453 		ASSERT(!dsp->ds_tx_qbusy);
454 		ASSERT(dsp->ds_tx_flow_mp != NULL);
455 		ASSERT(dsp->ds_tx_list_head == NULL);
456 		ASSERT(dsp->ds_tx_list_tail == NULL);
457 		ASSERT(dsp->ds_tx_cnt == 0);
458 		ASSERT(dsp->ds_tx_msgcnt == 0);
459 		mutex_exit(&dsp->ds_tx_list_lock);
460 		rw_exit(&dsp->ds_lock);
461 		return;
462 	}
463 	head = mp;
464 	tail = dsp->ds_tx_list_tail;
465 	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
466 	cnt = dsp->ds_tx_cnt;
467 	msgcnt = dsp->ds_tx_msgcnt;
468 	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
469 	mutex_exit(&dsp->ds_tx_list_lock);
470 
471 	/*
472 	 * Discard packets unless we are attached and bound; note that
473 	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
474 	 * because regardless of the mode all transmit will end up in
475 	 * dld_tx_single() where the packets may be queued.
476 	 */
477 	ASSERT((DB_TYPE(mp) == M_DATA) || (DB_TYPE(mp) == M_MULTIDATA));
478 	if (dsp->ds_dlstate != DL_IDLE) {
479 		freemsgchain(mp);
480 		goto done;
481 	}
482 
483 	/*
484 	 * Attempt to transmit one or more packets.  If the MAC can't
485 	 * send them all, re-queue the packet(s) at the beginning of
486 	 * the transmit queue to avoid any re-ordering.
487 	 */
488 	mp = dls_tx(dsp->ds_dc, mp);
489 	if (mp == head) {
490 		/*
491 		 * No message was sent out. Take the saved the queue depth
492 		 * as the input, so that dld_tx_enqueue() need not to
493 		 * calculate it again.
494 		 */
495 		dld_tx_enqueue(dsp, mp, tail, B_TRUE, msgcnt, cnt);
496 	} else if (mp != NULL) {
497 		/*
498 		 * Some but not all messages were sent out. dld_tx_enqueue()
499 		 * needs to start the timer to calculate the queue depth if
500 		 * timer has not been started.
501 		 *
502 		 * Note that a timer is used to calculate the queue depth
503 		 * to improve network performance, especially for TCP, in
504 		 * which case packets are sent without canput() being checked,
505 		 * and mostly end up in dld_tx_enqueue() under heavy load.
506 		 */
507 		dld_tx_enqueue(dsp, mp, tail, B_TRUE, 0, 0);
508 	}
509 
510 done:
511 	/*
512 	 * Grab the list lock again and check if the transmit queue is
513 	 * really empty; if so, lift up flow-control and backenable any
514 	 * writer queues.  If the queue is not empty, schedule service
515 	 * thread to drain it.
516 	 */
517 	mutex_enter(&dsp->ds_tx_list_lock);
518 	if (dsp->ds_tx_list_head == NULL) {
519 		dsp->ds_tx_flow_mp = getq(wq);
520 		ASSERT(dsp->ds_tx_flow_mp != NULL);
521 		dsp->ds_tx_qbusy = B_FALSE;
522 		if ((tid = dsp->ds_tx_qdepth_tid) != 0)
523 			dsp->ds_tx_qdepth_tid = 0;
524 	}
525 	mutex_exit(&dsp->ds_tx_list_lock);
526 
527 	/*
528 	 * Note that ds_tx_list_lock (which is acquired by the timeout
529 	 * callback routine) cannot be held across the call to untimeout().
530 	 */
531 	if (tid != 0)
532 		(void) untimeout(tid);
533 
534 	rw_exit(&dsp->ds_lock);
535 }
536 
537 void
538 dld_init_ops(struct dev_ops *ops, const char *name)
539 {
540 	struct streamtab *stream;
541 	struct qinit *rq, *wq;
542 	struct module_info *modinfo;
543 
544 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
545 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
546 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
547 	modinfo->mi_minpsz = 0;
548 	modinfo->mi_maxpsz = 64*1024;
549 	modinfo->mi_hiwat  = 1;
550 	modinfo->mi_lowat = 0;
551 
552 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
553 	rq->qi_qopen = dld_open;
554 	rq->qi_qclose = dld_close;
555 	rq->qi_minfo = modinfo;
556 
557 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
558 	wq->qi_putp = (pfi_t)dld_wput;
559 	wq->qi_srvp = (pfi_t)dld_wsrv;
560 	wq->qi_minfo = modinfo;
561 
562 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
563 	stream->st_rdinit = rq;
564 	stream->st_wrinit = wq;
565 	ops->devo_cb_ops->cb_str = stream;
566 
567 	if (ops->devo_getinfo == NULL)
568 		ops->devo_getinfo = &dld_getinfo;
569 }
570 
571 void
572 dld_fini_ops(struct dev_ops *ops)
573 {
574 	struct streamtab *stream;
575 	struct qinit *rq, *wq;
576 	struct module_info *modinfo;
577 
578 	stream = ops->devo_cb_ops->cb_str;
579 	rq = stream->st_rdinit;
580 	wq = stream->st_wrinit;
581 	modinfo = rq->qi_minfo;
582 	ASSERT(wq->qi_minfo == modinfo);
583 
584 	kmem_free(stream, sizeof (struct streamtab));
585 	kmem_free(wq, sizeof (struct qinit));
586 	kmem_free(rq, sizeof (struct qinit));
587 	kmem_free(modinfo->mi_idname, FMNAMESZ);
588 	kmem_free(modinfo, sizeof (struct module_info));
589 }
590 
591 /*
592  * Initialize this module's data structures.
593  */
594 void
595 dld_str_init(void)
596 {
597 	/*
598 	 * Create dld_str_t object cache.
599 	 */
600 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
601 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
602 	ASSERT(str_cachep != NULL);
603 
604 	/*
605 	 * Create taskq to process DLPI requests.
606 	 */
607 	dld_disp_taskq = taskq_create("dld_disp_taskq", 1024, MINCLSYSPRI, 2,
608 	    INT_MAX, TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
609 
610 	/*
611 	 * Create a hash table for maintaining dld_str_t's.
612 	 * The ds_minor field (the clone minor number) of a dld_str_t
613 	 * is used as a key for this hash table because this number is
614 	 * globally unique (allocated from "dls_minor_arena").
615 	 */
616 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
617 	    mod_hash_null_valdtor);
618 }
619 
620 /*
621  * Tear down this module's data structures.
622  */
623 int
624 dld_str_fini(void)
625 {
626 	/*
627 	 * Make sure that there are no objects in use.
628 	 */
629 	if (str_count != 0)
630 		return (EBUSY);
631 
632 	ASSERT(dld_disp_taskq != NULL);
633 	taskq_destroy(dld_disp_taskq);
634 	dld_disp_taskq = NULL;
635 
636 	/*
637 	 * Destroy object cache.
638 	 */
639 	kmem_cache_destroy(str_cachep);
640 	mod_hash_destroy_idhash(str_hashp);
641 	return (0);
642 }
643 
644 /*
645  * Create a new dld_str_t object.
646  */
647 dld_str_t *
648 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
649 {
650 	dld_str_t	*dsp;
651 	int		err;
652 
653 	/*
654 	 * Allocate an object from the cache.
655 	 */
656 	atomic_add_32(&str_count, 1);
657 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
658 
659 	/*
660 	 * Allocate the dummy mblk for flow-control.
661 	 */
662 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
663 	if (dsp->ds_tx_flow_mp == NULL) {
664 		kmem_cache_free(str_cachep, dsp);
665 		atomic_add_32(&str_count, -1);
666 		return (NULL);
667 	}
668 	dsp->ds_type = type;
669 	dsp->ds_major = major;
670 	dsp->ds_style = style;
671 	dsp->ds_tx = dsp->ds_unitdata_tx = NULL;
672 
673 	/*
674 	 * Initialize the queue pointers.
675 	 */
676 	ASSERT(RD(rq) == rq);
677 	dsp->ds_rq = rq;
678 	dsp->ds_wq = WR(rq);
679 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
680 
681 	/*
682 	 * We want explicit control over our write-side STREAMS queue
683 	 * where the dummy mblk gets added/removed for flow-control.
684 	 */
685 	noenable(WR(rq));
686 
687 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
688 	    (mod_hash_val_t)dsp);
689 	ASSERT(err == 0);
690 	return (dsp);
691 }
692 
693 void
694 dld_finish_pending_task(dld_str_t *dsp)
695 {
696 	/*
697 	 * Wait until the pending requests are processed by the worker thread.
698 	 */
699 	mutex_enter(&dsp->ds_disp_lock);
700 	dsp->ds_closing = B_TRUE;
701 	while (dsp->ds_tid != NULL)
702 		cv_wait(&dsp->ds_disp_cv, &dsp->ds_disp_lock);
703 	dsp->ds_closing = B_FALSE;
704 	mutex_exit(&dsp->ds_disp_lock);
705 }
706 
707 /*
708  * Destroy a dld_str_t object.
709  */
710 void
711 dld_str_destroy(dld_str_t *dsp)
712 {
713 	queue_t		*rq;
714 	queue_t		*wq;
715 	mod_hash_val_t	val;
716 	/*
717 	 * Clear the queue pointers.
718 	 */
719 	rq = dsp->ds_rq;
720 	wq = dsp->ds_wq;
721 	ASSERT(wq == WR(rq));
722 
723 	rq->q_ptr = wq->q_ptr = NULL;
724 	dsp->ds_rq = dsp->ds_wq = NULL;
725 
726 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
727 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
728 	ASSERT(dsp->ds_tx_list_head == NULL);
729 	ASSERT(dsp->ds_tx_list_tail == NULL);
730 	ASSERT(dsp->ds_tx_cnt == 0);
731 	ASSERT(dsp->ds_tx_msgcnt == 0);
732 	ASSERT(dsp->ds_tx_qdepth_tid == 0);
733 	ASSERT(!dsp->ds_tx_qbusy);
734 
735 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
736 	ASSERT(dsp->ds_pending_head == NULL);
737 	ASSERT(dsp->ds_pending_tail == NULL);
738 	ASSERT(dsp->ds_tx == NULL);
739 	ASSERT(dsp->ds_unitdata_tx == NULL);
740 
741 	/*
742 	 * Reinitialize all the flags.
743 	 */
744 	dsp->ds_notifications = 0;
745 	dsp->ds_passivestate = DLD_UNINITIALIZED;
746 	dsp->ds_mode = DLD_UNITDATA;
747 	dsp->ds_native = B_FALSE;
748 
749 	/*
750 	 * Free the dummy mblk if exists.
751 	 */
752 	if (dsp->ds_tx_flow_mp != NULL) {
753 		freeb(dsp->ds_tx_flow_mp);
754 		dsp->ds_tx_flow_mp = NULL;
755 	}
756 
757 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
758 	ASSERT(dsp == (dld_str_t *)val);
759 
760 	/*
761 	 * Free the object back to the cache.
762 	 */
763 	kmem_cache_free(str_cachep, dsp);
764 	atomic_add_32(&str_count, -1);
765 }
766 
767 /*
768  * kmem_cache contructor function: see kmem_cache_create(9f).
769  */
770 /*ARGSUSED*/
771 static int
772 str_constructor(void *buf, void *cdrarg, int kmflags)
773 {
774 	dld_str_t	*dsp = buf;
775 
776 	bzero(buf, sizeof (dld_str_t));
777 
778 	/*
779 	 * Allocate a new minor number.
780 	 */
781 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
782 		return (-1);
783 
784 	/*
785 	 * Initialize the DLPI state machine.
786 	 */
787 	dsp->ds_dlstate = DL_UNATTACHED;
788 
789 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
790 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
791 	mutex_init(&dsp->ds_disp_lock, NULL, MUTEX_DRIVER, NULL);
792 	cv_init(&dsp->ds_disp_cv, NULL, CV_DRIVER, NULL);
793 	mutex_init(&dsp->ds_tx_lock, NULL, MUTEX_DRIVER, NULL);
794 	cv_init(&dsp->ds_tx_cv, NULL, CV_DRIVER, NULL);
795 
796 	return (0);
797 }
798 
799 /*
800  * kmem_cache destructor function.
801  */
802 /*ARGSUSED*/
803 static void
804 str_destructor(void *buf, void *cdrarg)
805 {
806 	dld_str_t	*dsp = buf;
807 
808 	/*
809 	 * Make sure the DLPI state machine was reset.
810 	 */
811 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
812 
813 	/*
814 	 * Make sure the data-link interface was closed.
815 	 */
816 	ASSERT(dsp->ds_mh == NULL);
817 	ASSERT(dsp->ds_dc == NULL);
818 	ASSERT(dsp->ds_tx == NULL);
819 	ASSERT(dsp->ds_unitdata_tx == NULL);
820 	ASSERT(dsp->ds_intx_cnt == 0);
821 	ASSERT(dsp->ds_detaching == B_FALSE);
822 
823 	/*
824 	 * Make sure enabled notifications are cleared.
825 	 */
826 	ASSERT(dsp->ds_notifications == 0);
827 
828 	/*
829 	 * Make sure polling is disabled.
830 	 */
831 	ASSERT(!dsp->ds_polling);
832 
833 	/*
834 	 * Release the minor number.
835 	 */
836 	mac_minor_rele(dsp->ds_minor);
837 
838 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
839 	rw_destroy(&dsp->ds_lock);
840 
841 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
842 	mutex_destroy(&dsp->ds_tx_list_lock);
843 	ASSERT(dsp->ds_tx_flow_mp == NULL);
844 	ASSERT(dsp->ds_pending_head == NULL);
845 	ASSERT(dsp->ds_pending_tail == NULL);
846 	ASSERT(!dsp->ds_closing);
847 
848 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
849 	mutex_destroy(&dsp->ds_disp_lock);
850 	cv_destroy(&dsp->ds_disp_cv);
851 
852 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_lock));
853 	mutex_destroy(&dsp->ds_tx_lock);
854 	cv_destroy(&dsp->ds_tx_cv);
855 }
856 
857 void
858 dld_tx_single(dld_str_t *dsp, mblk_t *mp)
859 {
860 	/*
861 	 * If we are busy enqueue the packet and return.
862 	 * Otherwise hand them over to the MAC driver for transmission.
863 	 * If the message didn't get sent it will be queued.
864 	 *
865 	 * Note here that we don't grab the list lock prior to checking
866 	 * the busy flag.  This is okay, because a missed transition
867 	 * will not cause any packet reordering for any particular TCP
868 	 * connection (which is single-threaded).  The enqueue routine
869 	 * will atomically set the busy flag and schedule the service
870 	 * thread to run; the flag is only cleared by the service thread
871 	 * when there is no more packet to be transmitted.
872 	 */
873 
874 	if (dsp->ds_tx_qbusy || ((mp = dls_tx(dsp->ds_dc, mp)) != NULL))
875 		dld_tx_enqueue(dsp, mp, mp, B_FALSE, 1, mp_getsize(mp));
876 }
877 
878 /*
879  * Update the priority bits and VID (may need to insert tag if mp points
880  * to an untagged packet).
881  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
882  */
883 static mblk_t *
884 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid)
885 {
886 	mblk_t *hmp;
887 	struct ether_vlan_header *evhp;
888 	struct ether_header *ehp;
889 	uint16_t old_tci = 0;
890 	size_t len;
891 
892 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
893 
894 	evhp = (struct ether_vlan_header *)mp->b_rptr;
895 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
896 		/*
897 		 * Tagged packet, update the priority bits.
898 		 */
899 		old_tci = ntohs(evhp->ether_tci);
900 		len = sizeof (struct ether_vlan_header);
901 
902 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
903 			/*
904 			 * In case some drivers only check the db_ref
905 			 * count of the first mblk, we pullup the
906 			 * message into a single mblk.
907 			 */
908 			hmp = msgpullup(mp, -1);
909 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
910 				freemsg(hmp);
911 				return (NULL);
912 			} else {
913 				freemsg(mp);
914 				mp = hmp;
915 			}
916 		}
917 
918 		evhp = (struct ether_vlan_header *)mp->b_rptr;
919 	} else {
920 		/*
921 		 * Untagged packet. Insert the special priority tag.
922 		 * First allocate a header mblk.
923 		 */
924 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
925 		if (hmp == NULL)
926 			return (NULL);
927 
928 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
929 		ehp = (struct ether_header *)mp->b_rptr;
930 
931 		/*
932 		 * Copy the MAC addresses and typelen
933 		 */
934 		bcopy(ehp, evhp, (ETHERADDRL * 2));
935 		evhp->ether_type = ehp->ether_type;
936 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
937 
938 		hmp->b_wptr += sizeof (struct ether_vlan_header);
939 		mp->b_rptr += sizeof (struct ether_header);
940 
941 		/*
942 		 * Free the original message if it's now empty. Link the
943 		 * rest of the messages to the header message.
944 		 */
945 		if (MBLKL(mp) == 0) {
946 			hmp->b_cont = mp->b_cont;
947 			freeb(mp);
948 		} else {
949 			hmp->b_cont = mp;
950 		}
951 		mp = hmp;
952 	}
953 
954 	if (pri == 0)
955 		pri = VLAN_PRI(old_tci);
956 	if (vid == VLAN_ID_NONE)
957 		vid = VLAN_ID(old_tci);
958 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
959 	return (mp);
960 }
961 
962 /*
963  * M_DATA put
964  *
965  * The poll callback function for DLS clients which are not in the per-stream
966  * mode. This function is called from an upper layer protocol (currently only
967  * tcp and udp).
968  */
969 void
970 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
971 {
972 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
973 	mblk_t *newmp;
974 	uint_t pri;
975 
976 	if (is_ethernet) {
977 		/*
978 		 * Update the priority bits to the assigned priority.
979 		 */
980 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
981 
982 		if (pri != 0) {
983 			newmp = i_dld_ether_header_update_tag(mp, pri,
984 			    VLAN_ID_NONE);
985 			if (newmp == NULL)
986 				goto discard;
987 			mp = newmp;
988 		}
989 	}
990 
991 	dld_tx_single(dsp, mp);
992 	return;
993 
994 discard:
995 	/* TODO: bump kstat? */
996 	freemsg(mp);
997 }
998 
999 /*
1000  * M_DATA put (DLIOCRAW mode).
1001  */
1002 void
1003 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
1004 {
1005 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1006 	mblk_t *bp, *newmp;
1007 	size_t size;
1008 	mac_header_info_t mhi;
1009 	uint_t pri, vid;
1010 	uint_t max_sdu;
1011 
1012 	/*
1013 	 * Certain MAC type plugins provide an illusion for raw DLPI
1014 	 * consumers.  They pretend that the MAC layer is something that
1015 	 * it's not for the benefit of observability tools.  For example,
1016 	 * mac_wifi pretends that it's Ethernet for such consumers.
1017 	 * Here, unless native mode is enabled, we call into the MAC layer so
1018 	 * that this illusion can be maintained.  The plugin will optionally
1019 	 * transform the MAC header here into something that can be passed
1020 	 * down.  The header goes from raw mode to "cooked" mode.
1021 	 */
1022 	if (!dsp->ds_native) {
1023 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
1024 			goto discard;
1025 		mp = newmp;
1026 	}
1027 
1028 	size = MBLKL(mp);
1029 
1030 	/*
1031 	 * Check the packet is not too big and that any remaining
1032 	 * fragment list is composed entirely of M_DATA messages. (We
1033 	 * know the first fragment was M_DATA otherwise we could not
1034 	 * have got here).
1035 	 */
1036 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
1037 		if (DB_TYPE(bp) != M_DATA)
1038 			goto discard;
1039 		size += MBLKL(bp);
1040 	}
1041 
1042 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
1043 		goto discard;
1044 
1045 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
1046 	/*
1047 	 * If LSO is enabled, check the size against lso_max. Otherwise,
1048 	 * compare the packet size with max_sdu.
1049 	 */
1050 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
1051 	if (size > max_sdu + mhi.mhi_hdrsize)
1052 		goto discard;
1053 
1054 	if (is_ethernet) {
1055 		/*
1056 		 * Discard the packet if this is a VLAN stream but the VID in
1057 		 * the packet is not correct.
1058 		 */
1059 		vid = VLAN_ID(mhi.mhi_tci);
1060 		if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
1061 			goto discard;
1062 
1063 		/*
1064 		 * Discard the packet if this packet is a tagged packet
1065 		 * but both pri and VID are 0.
1066 		 */
1067 		pri = VLAN_PRI(mhi.mhi_tci);
1068 		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
1069 			goto discard;
1070 
1071 		/*
1072 		 * Update the priority bits to the per-stream priority if
1073 		 * priority is not set in the packet. Update the VID for
1074 		 * packets on a VLAN stream.
1075 		 */
1076 		pri = (pri == 0) ? dsp->ds_pri : 0;
1077 		if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) {
1078 			if ((newmp = i_dld_ether_header_update_tag(mp,
1079 			    pri, dsp->ds_vid)) == NULL) {
1080 				goto discard;
1081 			}
1082 			mp = newmp;
1083 		}
1084 	}
1085 
1086 	dld_tx_single(dsp, mp);
1087 	return;
1088 
1089 discard:
1090 	/* TODO: bump kstat? */
1091 	freemsg(mp);
1092 }
1093 
1094 /*
1095  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
1096  */
1097 int
1098 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1099 {
1100 	dev_t				dev;
1101 	int				err;
1102 	const char			*drvname;
1103 	dls_channel_t			dc;
1104 	uint_t				addr_length;
1105 	boolean_t			qassociated = B_FALSE;
1106 
1107 	ASSERT(dsp->ds_dc == NULL);
1108 
1109 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1110 		return (EINVAL);
1111 
1112 	/*
1113 	 * /dev node access. This will still be supported for backward
1114 	 * compatibility reason.
1115 	 */
1116 	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
1117 	    (strcmp(drvname, "vnic") != 0)) {
1118 		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1119 			return (EINVAL);
1120 		qassociated = B_TRUE;
1121 	}
1122 
1123 	/*
1124 	 * Open a channel.
1125 	 */
1126 	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) {
1127 		/*
1128 		 * style-2 VLAN open, this is a /dev VLAN ppa open
1129 		 * which might result in a newly created dls_vlan_t.
1130 		 */
1131 		err = dls_open_style2_vlan(dsp->ds_major, ppa, &dc);
1132 		if (err != 0) {
1133 			if (qassociated)
1134 				(void) qassociate(dsp->ds_wq, -1);
1135 			return (err);
1136 		}
1137 	} else {
1138 		dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
1139 		if ((err = dls_open_by_dev(dev, &dc)) != 0) {
1140 			if (qassociated)
1141 				(void) qassociate(dsp->ds_wq, -1);
1142 			return (err);
1143 		}
1144 	}
1145 
1146 	/*
1147 	 * Cache the MAC interface handle, a pointer to the immutable MAC
1148 	 * information and the current and 'factory' MAC address.
1149 	 */
1150 	dsp->ds_mh = dls_mac(dc);
1151 	dsp->ds_mip = mac_info(dsp->ds_mh);
1152 
1153 	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1154 
1155 	addr_length = dsp->ds_mip->mi_addr_length;
1156 	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
1157 
1158 	/*
1159 	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
1160 	 * a non-VLAN interface).
1161 	 */
1162 	dsp->ds_vid = dls_vid(dc);
1163 
1164 	/*
1165 	 * Set the default packet priority.
1166 	 */
1167 	dsp->ds_pri = 0;
1168 
1169 	/*
1170 	 * Add a notify function so that the we get updates from the MAC.
1171 	 */
1172 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
1173 
1174 	dsp->ds_dc = dc;
1175 	dsp->ds_dlstate = DL_UNBOUND;
1176 
1177 	return (0);
1178 }
1179 
/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 *
 * Tears down the per-link state that dld_str_attach() set up: the MAC
 * notify callback is removed, capabilities are disabled, queued transmit
 * packets are flushed, the DLS channel is closed and the stream goes back
 * to DL_UNATTACHED.
 */
void
dld_str_detach(dld_str_t *dsp)
{
	/*
	 * Remove the notify function.
	 */
	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);

	/*
	 * Disable the capabilities and clear the promisc flag.
	 * Polling and soft rings are expected to be off already.
	 */
	ASSERT(!dsp->ds_polling);
	ASSERT(!dsp->ds_soft_ring);
	dld_capabilities_disable(dsp);
	dsp->ds_promisc = 0;

	DLD_TX_QUIESCE(dsp);

	/*
	 * Flush all pending packets which are sitting in the transmit queue.
	 */
	dld_tx_flush(dsp);

	/*
	 * Clear LSO flags.
	 */
	dsp->ds_lso = B_FALSE;
	dsp->ds_lso_max = 0;

	/*
	 * Close the channel and drop the cached channel and MAC handles.
	 */
	dls_close(dsp->ds_dc);
	dsp->ds_dc = NULL;
	dsp->ds_mh = NULL;

	/*
	 * For style 2 streams, clear the queue association.
	 */
	if (dsp->ds_style == DL_STYLE2)
		(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;

}
1226 
1227 /*
1228  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1229  * tags before sending packets up to the DLS clients, with the exception of
1230  * special priority tagged packets, in that case, we set the VID to 0.
1231  * mp must be a VLAN tagged packet.
1232  */
1233 static mblk_t *
1234 i_dld_ether_header_strip_tag(mblk_t *mp)
1235 {
1236 	mblk_t *newmp;
1237 	struct ether_vlan_header *evhp;
1238 	uint16_t tci, new_tci;
1239 
1240 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1241 	if (DB_REF(mp) > 1) {
1242 		newmp = copymsg(mp);
1243 		if (newmp == NULL)
1244 			return (NULL);
1245 		freemsg(mp);
1246 		mp = newmp;
1247 	}
1248 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1249 
1250 	tci = ntohs(evhp->ether_tci);
1251 	if (VLAN_PRI(tci) == 0) {
1252 		/*
1253 		 * Priority is 0, strip the tag.
1254 		 */
1255 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1256 		mp->b_rptr += VLAN_TAGSZ;
1257 	} else {
1258 		/*
1259 		 * Priority is not 0, update the VID to 0.
1260 		 */
1261 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1262 		evhp->ether_tci = htons(new_tci);
1263 	}
1264 	return (mp);
1265 }
1266 
1267 /*
1268  * Raw mode receive function.
1269  */
1270 /*ARGSUSED*/
1271 void
1272 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1273     mac_header_info_t *mhip)
1274 {
1275 	dld_str_t *dsp = (dld_str_t *)arg;
1276 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1277 	mblk_t *next, *newmp;
1278 
1279 	ASSERT(mp != NULL);
1280 	do {
1281 		/*
1282 		 * Get the pointer to the next packet in the chain and then
1283 		 * clear b_next before the packet gets passed on.
1284 		 */
1285 		next = mp->b_next;
1286 		mp->b_next = NULL;
1287 
1288 		/*
1289 		 * Wind back b_rptr to point at the MAC header.
1290 		 */
1291 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1292 		mp->b_rptr -= mhip->mhi_hdrsize;
1293 
1294 		/*
1295 		 * Certain MAC type plugins provide an illusion for raw
1296 		 * DLPI consumers.  They pretend that the MAC layer is
1297 		 * something that it's not for the benefit of observability
1298 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1299 		 * for such consumers.	Here, unless native mode is enabled,
1300 		 * we call into the MAC layer so that this illusion can be
1301 		 * maintained.	The plugin will optionally transform the MAC
1302 		 * header here into something that can be passed up to raw
1303 		 * consumers.  The header goes from "cooked" mode to raw mode.
1304 		 */
1305 		if (!dsp->ds_native) {
1306 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1307 			if (newmp == NULL) {
1308 				freemsg(mp);
1309 				goto next;
1310 			}
1311 			mp = newmp;
1312 		}
1313 
1314 		/*
1315 		 * Strip the VLAN tag for VLAN streams.
1316 		 */
1317 		if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) {
1318 			newmp = i_dld_ether_header_strip_tag(mp);
1319 			if (newmp == NULL) {
1320 				freemsg(mp);
1321 				goto next;
1322 			}
1323 			mp = newmp;
1324 		}
1325 
1326 		/*
1327 		 * Pass the packet on.
1328 		 */
1329 		if (canputnext(dsp->ds_rq))
1330 			putnext(dsp->ds_rq, mp);
1331 		else
1332 			freemsg(mp);
1333 
1334 next:
1335 		/*
1336 		 * Move on to the next packet in the chain.
1337 		 */
1338 		mp = next;
1339 	} while (mp != NULL);
1340 }
1341 
1342 /*
1343  * Fast-path receive function.
1344  */
1345 /*ARGSUSED*/
1346 void
1347 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1348     mac_header_info_t *mhip)
1349 {
1350 	dld_str_t *dsp = (dld_str_t *)arg;
1351 	mblk_t *next;
1352 	size_t offset = 0;
1353 
1354 	/*
1355 	 * MAC header stripping rules:
1356 	 *    - Tagged packets:
1357 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1358 	 *	b. Physical streams
1359 	 *	- VLAN packets (non-zero VID). The stream must be either a
1360 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1361 	 *	  Strip the Ethernet header but keep the VLAN header.
1362 	 *	- Special tagged packets (zero VID)
1363 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1364 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1365 	 *	    keep the VLAN header.
1366 	 *	  * Otherwise, strip the whole VLAN header.
1367 	 *    - Untagged packets. Strip the whole MAC header.
1368 	 */
1369 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1370 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1371 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1372 		offset = VLAN_TAGSZ;
1373 	}
1374 
1375 	ASSERT(mp != NULL);
1376 	do {
1377 		/*
1378 		 * Get the pointer to the next packet in the chain and then
1379 		 * clear b_next before the packet gets passed on.
1380 		 */
1381 		next = mp->b_next;
1382 		mp->b_next = NULL;
1383 
1384 		/*
1385 		 * Wind back b_rptr to point at the VLAN header.
1386 		 */
1387 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1388 		mp->b_rptr -= offset;
1389 
1390 		/*
1391 		 * Pass the packet on.
1392 		 */
1393 		if (canputnext(dsp->ds_rq))
1394 			putnext(dsp->ds_rq, mp);
1395 		else
1396 			freemsg(mp);
1397 		/*
1398 		 * Move on to the next packet in the chain.
1399 		 */
1400 		mp = next;
1401 	} while (mp != NULL);
1402 }
1403 
1404 /*
1405  * Default receive function (send DL_UNITDATA_IND messages).
1406  */
1407 /*ARGSUSED*/
1408 void
1409 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1410     mac_header_info_t *mhip)
1411 {
1412 	dld_str_t		*dsp = (dld_str_t *)arg;
1413 	mblk_t			*ud_mp;
1414 	mblk_t			*next;
1415 	size_t			offset = 0;
1416 	boolean_t		strip_vlan = B_TRUE;
1417 
1418 	/*
1419 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1420 	 */
1421 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1422 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1423 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1424 		offset = VLAN_TAGSZ;
1425 		strip_vlan = B_FALSE;
1426 	}
1427 
1428 	ASSERT(mp != NULL);
1429 	do {
1430 		/*
1431 		 * Get the pointer to the next packet in the chain and then
1432 		 * clear b_next before the packet gets passed on.
1433 		 */
1434 		next = mp->b_next;
1435 		mp->b_next = NULL;
1436 
1437 		/*
1438 		 * Wind back b_rptr to point at the MAC header.
1439 		 */
1440 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1441 		mp->b_rptr -= mhip->mhi_hdrsize;
1442 
1443 		/*
1444 		 * Create the DL_UNITDATA_IND M_PROTO.
1445 		 */
1446 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1447 			freemsgchain(mp);
1448 			return;
1449 		}
1450 
1451 		/*
1452 		 * Advance b_rptr to point at the payload (or the VLAN header).
1453 		 */
1454 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1455 
1456 		/*
1457 		 * Prepend the DL_UNITDATA_IND.
1458 		 */
1459 		ud_mp->b_cont = mp;
1460 
1461 		/*
1462 		 * Send the message.
1463 		 */
1464 		if (canputnext(dsp->ds_rq))
1465 			putnext(dsp->ds_rq, ud_mp);
1466 		else
1467 			freemsg(ud_mp);
1468 
1469 		/*
1470 		 * Move on to the next packet in the chain.
1471 		 */
1472 		mp = next;
1473 	} while (mp != NULL);
1474 }
1475 
1476 /*
1477  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1478  */
1479 static void
1480 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
1481 {
1482 	mblk_t		*mp;
1483 	dl_notify_ind_t *dlip;
1484 
1485 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
1486 		return;
1487 
1488 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1489 	    M_PROTO, 0)) == NULL)
1490 		return;
1491 
1492 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1493 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1494 	dlip->dl_primitive = DL_NOTIFY_IND;
1495 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
1496 	dlip->dl_data = max_sdu;
1497 
1498 	qreply(dsp->ds_wq, mp);
1499 }
1500 
1501 /*
1502  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1503  * current state of the interface.
1504  */
1505 void
1506 dld_str_notify_ind(dld_str_t *dsp)
1507 {
1508 	mac_notify_type_t	type;
1509 
1510 	for (type = 0; type < MAC_NNOTE; type++)
1511 		str_notify(dsp, type);
1512 }
1513 
/*
 * Layout of the DL_UNITDATA_IND message built by str_unitdata_ind(): the
 * DLPI primitive followed by room for the destination and source DLSAP
 * addresses.  Each address slot is sized for the largest MAC address
 * (MAXMACADDRLEN) plus the uint16_t SAP that is stored right after it.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;	/* must be first */
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1519 
1520 /*
1521  * Create a DL_UNITDATA_IND M_PROTO message.
1522  */
1523 static mblk_t *
1524 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1525 {
1526 	mblk_t				*nmp;
1527 	dl_unitdata_ind_wrapper_t	*dlwp;
1528 	dl_unitdata_ind_t		*dlp;
1529 	mac_header_info_t		mhi;
1530 	uint_t				addr_length;
1531 	uint8_t				*daddr;
1532 	uint8_t				*saddr;
1533 
1534 	/*
1535 	 * Get the packet header information.
1536 	 */
1537 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
1538 		return (NULL);
1539 
1540 	/*
1541 	 * Allocate a message large enough to contain the wrapper structure
1542 	 * defined above.
1543 	 */
1544 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1545 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1546 	    DL_UNITDATA_IND)) == NULL)
1547 		return (NULL);
1548 
1549 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1550 
1551 	dlp = &(dlwp->dl_unitdata);
1552 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1553 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1554 
1555 	/*
1556 	 * Copy in the destination address.
1557 	 */
1558 	addr_length = dsp->ds_mip->mi_addr_length;
1559 	daddr = dlwp->dl_dest_addr;
1560 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1561 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1562 
1563 	/*
1564 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1565 	 */
1566 	if (mhi.mhi_istagged && !strip_vlan)
1567 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1568 	else
1569 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1570 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1571 
1572 	/*
1573 	 * If the destination address was multicast or broadcast then the
1574 	 * dl_group_address field should be non-zero.
1575 	 */
1576 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1577 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1578 
1579 	/*
1580 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1581 	 * for example) may not have access to source information.
1582 	 */
1583 	if (mhi.mhi_saddr == NULL) {
1584 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1585 	} else {
1586 		saddr = dlwp->dl_src_addr;
1587 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1588 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1589 
1590 		/*
1591 		 * Set the source DLSAP to the packet ethertype.
1592 		 */
1593 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1594 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1595 	}
1596 
1597 	return (nmp);
1598 }
1599 
1600 /*
1601  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1602  */
1603 static void
1604 str_notify_promisc_on_phys(dld_str_t *dsp)
1605 {
1606 	mblk_t		*mp;
1607 	dl_notify_ind_t	*dlip;
1608 
1609 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1610 		return;
1611 
1612 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1613 	    M_PROTO, 0)) == NULL)
1614 		return;
1615 
1616 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1617 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1618 	dlip->dl_primitive = DL_NOTIFY_IND;
1619 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1620 
1621 	qreply(dsp->ds_wq, mp);
1622 }
1623 
1624 /*
1625  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1626  */
1627 static void
1628 str_notify_promisc_off_phys(dld_str_t *dsp)
1629 {
1630 	mblk_t		*mp;
1631 	dl_notify_ind_t	*dlip;
1632 
1633 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1634 		return;
1635 
1636 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1637 	    M_PROTO, 0)) == NULL)
1638 		return;
1639 
1640 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1641 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1642 	dlip->dl_primitive = DL_NOTIFY_IND;
1643 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1644 
1645 	qreply(dsp->ds_wq, mp);
1646 }
1647 
1648 /*
1649  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1650  */
1651 static void
1652 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1653 {
1654 	mblk_t		*mp;
1655 	dl_notify_ind_t	*dlip;
1656 	uint_t		addr_length;
1657 	uint16_t	ethertype;
1658 
1659 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1660 		return;
1661 
1662 	addr_length = dsp->ds_mip->mi_addr_length;
1663 	if ((mp = mexchange(dsp->ds_wq, NULL,
1664 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1665 	    M_PROTO, 0)) == NULL)
1666 		return;
1667 
1668 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1669 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1670 	dlip->dl_primitive = DL_NOTIFY_IND;
1671 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1672 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1673 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1674 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1675 
1676 	bcopy(addr, &dlip[1], addr_length);
1677 
1678 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1679 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1680 
1681 	qreply(dsp->ds_wq, mp);
1682 }
1683 
1684 /*
1685  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1686  */
1687 static void
1688 str_notify_link_up(dld_str_t *dsp)
1689 {
1690 	mblk_t		*mp;
1691 	dl_notify_ind_t	*dlip;
1692 
1693 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1694 		return;
1695 
1696 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1697 	    M_PROTO, 0)) == NULL)
1698 		return;
1699 
1700 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1701 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1702 	dlip->dl_primitive = DL_NOTIFY_IND;
1703 	dlip->dl_notification = DL_NOTE_LINK_UP;
1704 
1705 	qreply(dsp->ds_wq, mp);
1706 }
1707 
1708 /*
1709  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1710  */
1711 static void
1712 str_notify_link_down(dld_str_t *dsp)
1713 {
1714 	mblk_t		*mp;
1715 	dl_notify_ind_t	*dlip;
1716 
1717 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1718 		return;
1719 
1720 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1721 	    M_PROTO, 0)) == NULL)
1722 		return;
1723 
1724 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1725 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1726 	dlip->dl_primitive = DL_NOTIFY_IND;
1727 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1728 
1729 	qreply(dsp->ds_wq, mp);
1730 }
1731 
1732 /*
1733  * DL_NOTIFY_IND: DL_NOTE_SPEED
1734  */
1735 static void
1736 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1737 {
1738 	mblk_t		*mp;
1739 	dl_notify_ind_t	*dlip;
1740 
1741 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1742 		return;
1743 
1744 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1745 	    M_PROTO, 0)) == NULL)
1746 		return;
1747 
1748 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1749 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1750 	dlip->dl_primitive = DL_NOTIFY_IND;
1751 	dlip->dl_notification = DL_NOTE_SPEED;
1752 	dlip->dl_data = speed;
1753 
1754 	qreply(dsp->ds_wq, mp);
1755 }
1756 
1757 /*
1758  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1759  */
1760 static void
1761 str_notify_capab_reneg(dld_str_t *dsp)
1762 {
1763 	mblk_t		*mp;
1764 	dl_notify_ind_t	*dlip;
1765 
1766 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1767 		return;
1768 
1769 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1770 	    M_PROTO, 0)) == NULL)
1771 		return;
1772 
1773 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1774 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1775 	dlip->dl_primitive = DL_NOTIFY_IND;
1776 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1777 
1778 	qreply(dsp->ds_wq, mp);
1779 }
1780 
1781 /*
1782  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1783  */
1784 static void
1785 str_notify_fastpath_flush(dld_str_t *dsp)
1786 {
1787 	mblk_t		*mp;
1788 	dl_notify_ind_t	*dlip;
1789 
1790 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1791 		return;
1792 
1793 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1794 	    M_PROTO, 0)) == NULL)
1795 		return;
1796 
1797 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1798 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1799 	dlip->dl_primitive = DL_NOTIFY_IND;
1800 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1801 
1802 	qreply(dsp->ds_wq, mp);
1803 }
1804 
1805 /*
1806  * MAC notification callback.
1807  */
1808 static void
1809 str_notify(void *arg, mac_notify_type_t type)
1810 {
1811 	dld_str_t		*dsp = (dld_str_t *)arg;
1812 	queue_t			*q = dsp->ds_wq;
1813 
1814 	switch (type) {
1815 	case MAC_NOTE_TX:
1816 		qenable(q);
1817 		break;
1818 
1819 	case MAC_NOTE_DEVPROMISC:
1820 		/*
1821 		 * Send the appropriate DL_NOTIFY_IND.
1822 		 */
1823 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1824 			str_notify_promisc_on_phys(dsp);
1825 		else
1826 			str_notify_promisc_off_phys(dsp);
1827 		break;
1828 
1829 	case MAC_NOTE_PROMISC:
1830 		break;
1831 
1832 	case MAC_NOTE_UNICST:
1833 		/*
1834 		 * This notification is sent whenever the MAC unicast address
1835 		 * changes. We need to re-cache the address.
1836 		 */
1837 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1838 
1839 		/*
1840 		 * Send the appropriate DL_NOTIFY_IND.
1841 		 */
1842 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1843 		break;
1844 
1845 	case MAC_NOTE_LINK:
1846 		/*
1847 		 * This notification is sent every time the MAC driver
1848 		 * updates the link state.
1849 		 */
1850 		switch (mac_link_get(dsp->ds_mh)) {
1851 		case LINK_STATE_UP: {
1852 			uint64_t speed;
1853 			/*
1854 			 * The link is up so send the appropriate
1855 			 * DL_NOTIFY_IND.
1856 			 */
1857 			str_notify_link_up(dsp);
1858 
1859 			speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
1860 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1861 			break;
1862 		}
1863 		case LINK_STATE_DOWN:
1864 			/*
1865 			 * The link is down so send the appropriate
1866 			 * DL_NOTIFY_IND.
1867 			 */
1868 			str_notify_link_down(dsp);
1869 			break;
1870 
1871 		default:
1872 			break;
1873 		}
1874 		break;
1875 
1876 	case MAC_NOTE_RESOURCE:
1877 	case MAC_NOTE_VNIC:
1878 		/*
1879 		 * This notification is sent whenever the MAC resources
1880 		 * change or capabilities change. We need to renegotiate
1881 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1882 		 */
1883 		str_notify_capab_reneg(dsp);
1884 		break;
1885 
1886 	case MAC_NOTE_SDU_SIZE: {
1887 		uint_t  max_sdu;
1888 		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
1889 		str_notify_sdu_size(dsp, max_sdu);
1890 		break;
1891 	}
1892 
1893 	case MAC_NOTE_FASTPATH_FLUSH:
1894 		str_notify_fastpath_flush(dsp);
1895 		break;
1896 
1897 	case MAC_NOTE_MARGIN:
1898 		break;
1899 
1900 	default:
1901 		ASSERT(B_FALSE);
1902 		break;
1903 	}
1904 }
1905 
1906 static inline uint_t
1907 mp_getsize(mblk_t *mp)
1908 {
1909 	ASSERT(DB_TYPE(mp) == M_DATA);
1910 	return ((mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp));
1911 }
1912 
1913 /*
1914  * Calculate the dld queue depth, free the messages that exceed the threshold.
1915  */
1916 static void
1917 dld_tx_qdepth_timer(void *arg)
1918 {
1919 	dld_str_t *dsp = (dld_str_t *)arg;
1920 	mblk_t *prev, *mp;
1921 	uint_t cnt, msgcnt, size;
1922 
1923 	mutex_enter(&dsp->ds_tx_list_lock);
1924 
1925 	/* Calculate total size and count of the packet(s) */
1926 	cnt = msgcnt = 0;
1927 	for (prev = NULL, mp = dsp->ds_tx_list_head; mp != NULL;
1928 	    prev = mp, mp = mp->b_next) {
1929 		size = mp_getsize(mp);
1930 		cnt += size;
1931 		msgcnt++;
1932 		if (cnt >= dld_max_q_count || msgcnt >= dld_max_q_count) {
1933 			ASSERT(dsp->ds_tx_qbusy);
1934 			dsp->ds_tx_list_tail = prev;
1935 			if (prev == NULL)
1936 				dsp->ds_tx_list_head = NULL;
1937 			else
1938 				prev->b_next = NULL;
1939 			freemsgchain(mp);
1940 			cnt -= size;
1941 			msgcnt--;
1942 			break;
1943 		}
1944 	}
1945 	dsp->ds_tx_cnt = cnt;
1946 	dsp->ds_tx_msgcnt = msgcnt;
1947 	dsp->ds_tx_qdepth_tid = 0;
1948 	mutex_exit(&dsp->ds_tx_list_lock);
1949 }
1950 
1951 /*
1952  * Enqueue one or more messages on the transmit queue. Caller specifies:
1953  *  - the insertion position (head/tail).
1954  *  - the message count and the total message size of messages to be queued
1955  *    if they are known to the caller; or 0 if they are not known.
1956  *
1957  * If the caller does not know the message size information, this usually
1958  * means that dld_wsrv() managed to send some but not all of the queued
1959  * messages. For performance reasons, we do not calculate the queue depth
1960  * every time. Instead, a timer is started to calculate the queue depth
1961  * every 1 second (can be changed by tx_qdepth_interval).
1962  */
1963 static void
1964 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, mblk_t *tail, boolean_t head_insert,
1965     uint_t msgcnt, uint_t cnt)
1966 {
1967 	queue_t *q = dsp->ds_wq;
1968 	uint_t tot_cnt, tot_msgcnt;
1969 	mblk_t *next;
1970 
1971 	mutex_enter(&dsp->ds_tx_list_lock);
1972 
1973 	/*
1974 	 * Simply enqueue the message and calculate the queue depth via
1975 	 * timer if:
1976 	 *
1977 	 * - the current queue depth is incorrect, and the timer is already
1978 	 *   started; or
1979 	 *
1980 	 * - the given message size is unknown and it is allowed to start the
1981 	 *   timer;
1982 	 */
1983 	if ((dsp->ds_tx_qdepth_tid != 0) ||
1984 	    (msgcnt == 0 && tx_qdepth_interval != 0)) {
1985 		goto enqueue;
1986 	}
1987 
1988 	/*
1989 	 * The timer is not allowed, so calculate the message size now.
1990 	 */
1991 	if (msgcnt == 0) {
1992 		for (next = mp; next != NULL; next = next->b_next) {
1993 			cnt += mp_getsize(next);
1994 			msgcnt++;
1995 		}
1996 	}
1997 
1998 	/*
1999 	 * Grow the queue depth using the input messesge size.
2000 	 *
2001 	 * If the queue depth would exceed the allowed threshold, drop
2002 	 * new packet(s) and drain those already in the queue.
2003 	 */
2004 	tot_cnt = dsp->ds_tx_cnt + cnt;
2005 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
2006 
2007 	if (!head_insert && (tot_cnt >= dld_max_q_count ||
2008 	    tot_msgcnt >= dld_max_q_count)) {
2009 		ASSERT(dsp->ds_tx_qbusy);
2010 		mutex_exit(&dsp->ds_tx_list_lock);
2011 		freemsgchain(mp);
2012 		goto done;
2013 	}
2014 	/* Update the queue size parameters */
2015 	dsp->ds_tx_cnt = tot_cnt;
2016 	dsp->ds_tx_msgcnt = tot_msgcnt;
2017 
2018 enqueue:
2019 	/*
2020 	 * If the transmit queue is currently empty and we are
2021 	 * about to deposit the packet(s) there, switch mode to
2022 	 * "busy" and raise flow-control condition.
2023 	 */
2024 	if (!dsp->ds_tx_qbusy) {
2025 		dsp->ds_tx_qbusy = B_TRUE;
2026 		ASSERT(dsp->ds_tx_flow_mp != NULL);
2027 		(void) putq(q, dsp->ds_tx_flow_mp);
2028 		dsp->ds_tx_flow_mp = NULL;
2029 	}
2030 
2031 	if (!head_insert) {
2032 		/* Tail insertion */
2033 		if (dsp->ds_tx_list_head == NULL)
2034 			dsp->ds_tx_list_head = mp;
2035 		else
2036 			dsp->ds_tx_list_tail->b_next = mp;
2037 		dsp->ds_tx_list_tail = tail;
2038 	} else {
2039 		/* Head insertion */
2040 		tail->b_next = dsp->ds_tx_list_head;
2041 		if (dsp->ds_tx_list_head == NULL)
2042 			dsp->ds_tx_list_tail = tail;
2043 		dsp->ds_tx_list_head = mp;
2044 	}
2045 
2046 	if (msgcnt == 0 && dsp->ds_tx_qdepth_tid == 0 &&
2047 	    tx_qdepth_interval != 0) {
2048 		/*
2049 		 * The message size is not given so that we need to start
2050 		 * the timer to calculate the queue depth.
2051 		 */
2052 		dsp->ds_tx_qdepth_tid = timeout(dld_tx_qdepth_timer, dsp,
2053 		    drv_usectohz(tx_qdepth_interval));
2054 		ASSERT(dsp->ds_tx_qdepth_tid != NULL);
2055 	}
2056 	mutex_exit(&dsp->ds_tx_list_lock);
2057 done:
2058 	/* Schedule service thread to drain the transmit queue */
2059 	if (!head_insert)
2060 		qenable(q);
2061 }
2062 
2063 void
2064 dld_tx_flush(dld_str_t *dsp)
2065 {
2066 	timeout_id_t	tid = 0;
2067 
2068 	mutex_enter(&dsp->ds_tx_list_lock);
2069 	if (dsp->ds_tx_list_head != NULL) {
2070 		freemsgchain(dsp->ds_tx_list_head);
2071 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
2072 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
2073 		if (dsp->ds_tx_qbusy) {
2074 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
2075 			ASSERT(dsp->ds_tx_flow_mp != NULL);
2076 			dsp->ds_tx_qbusy = B_FALSE;
2077 		}
2078 		if ((tid = dsp->ds_tx_qdepth_tid) != 0)
2079 			dsp->ds_tx_qdepth_tid = 0;
2080 	}
2081 	mutex_exit(&dsp->ds_tx_list_lock);
2082 
2083 	/*
2084 	 * Note that ds_tx_list_lock (which is acquired by the timeout
2085 	 * callback routine) cannot be held across the call to untimeout().
2086 	 */
2087 	if (tid != 0)
2088 		(void) untimeout(tid);
2089 }
2090 
2091 /*
2092  * Process a non-data message.
2093  */
2094 static void
2095 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
2096 {
2097 	ASSERT((dsp->ds_type == DLD_DLPI && dsp->ds_ioctl == NULL) ||
2098 	    (dsp->ds_type == DLD_CONTROL && dsp->ds_ioctl != NULL));
2099 
2100 	mutex_enter(&dsp->ds_disp_lock);
2101 
2102 	/*
2103 	 * The processing of the message might block. Enqueue the
2104 	 * message for later processing.
2105 	 */
2106 	if (dsp->ds_pending_head == NULL) {
2107 		dsp->ds_pending_head = dsp->ds_pending_tail = mp;
2108 	} else {
2109 		dsp->ds_pending_tail->b_next = mp;
2110 		dsp->ds_pending_tail = mp;
2111 	}
2112 
2113 	/*
2114 	 * If there is no task pending, kick off the task.
2115 	 */
2116 	if (dsp->ds_tid == NULL) {
2117 		dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
2118 		    dld_wput_nondata_task, dsp, TQ_SLEEP);
2119 		ASSERT(dsp->ds_tid != NULL);
2120 	}
2121 	mutex_exit(&dsp->ds_disp_lock);
2122 }
2123 
2124 /*
2125  * The worker thread which processes non-data messages. Note we only process
2126  * one message at one time in order to be able to "flush" the queued message
2127  * and serialize the processing.
2128  */
2129 static void
2130 dld_wput_nondata_task(void *arg)
2131 {
2132 	dld_str_t	*dsp = (dld_str_t *)arg;
2133 	mblk_t		*mp;
2134 
2135 	mutex_enter(&dsp->ds_disp_lock);
2136 	ASSERT(dsp->ds_pending_head != NULL);
2137 	ASSERT(dsp->ds_tid != NULL);
2138 
2139 	if (dsp->ds_closing)
2140 		goto closing;
2141 
2142 	mp = dsp->ds_pending_head;
2143 	if ((dsp->ds_pending_head = mp->b_next) == NULL)
2144 		dsp->ds_pending_tail = NULL;
2145 	mp->b_next = NULL;
2146 
2147 	mutex_exit(&dsp->ds_disp_lock);
2148 
2149 	switch (DB_TYPE(mp)) {
2150 	case M_PROTO:
2151 	case M_PCPROTO:
2152 		ASSERT(dsp->ds_type == DLD_DLPI);
2153 		dld_wput_proto_nondata(dsp, mp);
2154 		break;
2155 	case M_IOCTL: {
2156 		uint_t cmd;
2157 
2158 		if (dsp->ds_type == DLD_CONTROL) {
2159 			ASSERT(dsp->ds_ioctl != NULL);
2160 			dsp->ds_ioctl(dsp->ds_wq, mp);
2161 			break;
2162 		}
2163 
2164 		cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
2165 
2166 		switch (cmd) {
2167 		case DLIOCNATIVE:
2168 			ioc_native(dsp, mp);
2169 			break;
2170 		case DLIOCMARGININFO:
2171 			ioc_margin(dsp, mp);
2172 			break;
2173 		case DLIOCRAW:
2174 			ioc_raw(dsp, mp);
2175 			break;
2176 		case DLIOCHDRINFO:
2177 			ioc_fast(dsp, mp);
2178 			break;
2179 		default:
2180 			ioc(dsp, mp);
2181 			break;
2182 		}
2183 		break;
2184 	}
2185 	case M_IOCDATA:
2186 		ASSERT(dsp->ds_type == DLD_DLPI);
2187 		ioc(dsp, mp);
2188 		break;
2189 	}
2190 
2191 	mutex_enter(&dsp->ds_disp_lock);
2192 
2193 	if (dsp->ds_closing)
2194 		goto closing;
2195 
2196 	if (dsp->ds_pending_head != NULL) {
2197 		dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
2198 		    dld_wput_nondata_task, dsp, TQ_SLEEP);
2199 		ASSERT(dsp->ds_tid != NULL);
2200 	} else {
2201 		dsp->ds_tid = NULL;
2202 	}
2203 	mutex_exit(&dsp->ds_disp_lock);
2204 	return;
2205 
2206 	/*
2207 	 * If the stream is closing, flush all queued messages and inform
2208 	 * the stream once it is done.
2209 	 */
2210 closing:
2211 	freemsgchain(dsp->ds_pending_head);
2212 	dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
2213 	dsp->ds_tid = NULL;
2214 	cv_signal(&dsp->ds_disp_cv);
2215 	mutex_exit(&dsp->ds_disp_lock);
2216 }
2217 
2218 /*
2219  * Flush queued non-data messages.
2220  */
2221 static void
2222 dld_flush_nondata(dld_str_t *dsp)
2223 {
2224 	mutex_enter(&dsp->ds_disp_lock);
2225 	freemsgchain(dsp->ds_pending_head);
2226 	dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
2227 	mutex_exit(&dsp->ds_disp_lock);
2228 }
2229 
2230 /*
2231  * DLIOCNATIVE
2232  */
2233 static void
2234 ioc_native(dld_str_t *dsp, mblk_t *mp)
2235 {
2236 	queue_t *q = dsp->ds_wq;
2237 	const mac_info_t *mip = dsp->ds_mip;
2238 
2239 	rw_enter(&dsp->ds_lock, RW_WRITER);
2240 
2241 	/*
2242 	 * Native mode can be enabled if it's disabled and if the
2243 	 * native media type is different.
2244 	 */
2245 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2246 		dsp->ds_native = B_TRUE;
2247 
2248 	rw_exit(&dsp->ds_lock);
2249 
2250 	if (dsp->ds_native)
2251 		miocack(q, mp, 0, mip->mi_nativemedia);
2252 	else
2253 		miocnak(q, mp, 0, ENOTSUP);
2254 }
2255 
2256 /*
2257  * DLIOCMARGININFO
2258  */
2259 static void
2260 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2261 {
2262 	queue_t *q = dsp->ds_wq;
2263 	uint32_t margin;
2264 	int err;
2265 
2266 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2267 		err = EINVAL;
2268 		goto failed;
2269 	}
2270 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2271 		goto failed;
2272 
2273 	mac_margin_get(dsp->ds_mh, &margin);
2274 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2275 	miocack(q, mp, sizeof (uint32_t), 0);
2276 	return;
2277 
2278 failed:
2279 	miocnak(q, mp, 0, err);
2280 }
2281 
2282 /*
2283  * DLIOCRAW
2284  */
2285 static void
2286 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2287 {
2288 	queue_t *q = dsp->ds_wq;
2289 
2290 	if (dsp->ds_polling || dsp->ds_soft_ring) {
2291 		miocnak(q, mp, 0, EPROTO);
2292 		return;
2293 	}
2294 
2295 	rw_enter(&dsp->ds_lock, RW_WRITER);
2296 	if ((dsp->ds_mode != DLD_RAW) && (dsp->ds_dlstate == DL_IDLE)) {
2297 		/*
2298 		 * Set the receive callback.
2299 		 */
2300 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, dsp);
2301 		dsp->ds_tx = str_mdata_raw_put;
2302 	}
2303 	dsp->ds_mode = DLD_RAW;
2304 	rw_exit(&dsp->ds_lock);
2305 	miocack(q, mp, 0, 0);
2306 }
2307 
2308 /*
2309  * DLIOCHDRINFO
2310  */
2311 static void
2312 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2313 {
2314 	dl_unitdata_req_t *dlp;
2315 	off_t		off;
2316 	size_t		len;
2317 	const uint8_t	*addr;
2318 	uint16_t	sap;
2319 	mblk_t		*nmp;
2320 	mblk_t		*hmp;
2321 	uint_t		addr_length;
2322 	queue_t		*q = dsp->ds_wq;
2323 	int		err;
2324 
2325 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2326 		err = ENOTSUP;
2327 		goto failed;
2328 	}
2329 
2330 	/*
2331 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2332 	 * user-land should not be allowed.
2333 	 */
2334 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2335 		err = EINVAL;
2336 		goto failed;
2337 	}
2338 
2339 	nmp = mp->b_cont;
2340 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2341 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2342 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2343 		err = EINVAL;
2344 		goto failed;
2345 	}
2346 
2347 	off = dlp->dl_dest_addr_offset;
2348 	len = dlp->dl_dest_addr_length;
2349 
2350 	if (!MBLKIN(nmp, off, len)) {
2351 		err = EINVAL;
2352 		goto failed;
2353 	}
2354 
2355 	/*
2356 	 * We don't need to hold any locks to access ds_dlstate, because
2357 	 * control message prossessing (which updates this field) is
2358 	 * serialized.
2359 	 */
2360 	if (dsp->ds_dlstate != DL_IDLE) {
2361 		err = ENOTSUP;
2362 		goto failed;
2363 	}
2364 
2365 	addr_length = dsp->ds_mip->mi_addr_length;
2366 	if (len != addr_length + sizeof (uint16_t)) {
2367 		err = EINVAL;
2368 		goto failed;
2369 	}
2370 
2371 	addr = nmp->b_rptr + off;
2372 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2373 
2374 	if ((hmp = dls_header(dsp->ds_dc, addr, sap, 0, NULL)) == NULL) {
2375 		err = ENOMEM;
2376 		goto failed;
2377 	}
2378 
2379 	rw_enter(&dsp->ds_lock, RW_WRITER);
2380 	ASSERT(dsp->ds_dlstate == DL_IDLE);
2381 	if (dsp->ds_mode != DLD_FASTPATH) {
2382 		/*
2383 		 * Set the receive callback (unless polling or
2384 		 * soft-ring is enabled).
2385 		 */
2386 		dsp->ds_mode = DLD_FASTPATH;
2387 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
2388 			dls_rx_set(dsp->ds_dc, dld_str_rx_fastpath, dsp);
2389 		dsp->ds_tx = str_mdata_fastpath_put;
2390 	}
2391 	rw_exit(&dsp->ds_lock);
2392 
2393 	freemsg(nmp->b_cont);
2394 	nmp->b_cont = hmp;
2395 
2396 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2397 	return;
2398 failed:
2399 	miocnak(q, mp, 0, err);
2400 }
2401 
2402 static void
2403 ioc(dld_str_t *dsp, mblk_t *mp)
2404 {
2405 	queue_t	*q = dsp->ds_wq;
2406 	mac_handle_t mh;
2407 
2408 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2409 		miocnak(q, mp, 0, EINVAL);
2410 		return;
2411 	}
2412 	mh = dsp->ds_mh;
2413 	ASSERT(mh != NULL);
2414 	mac_ioctl(mh, q, mp);
2415 }
2416