xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_ldc.c (revision ef287aad)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/callb.h>
72 #include <sys/vlan.h>
73 
74 /* Port add/deletion/etc routines */
75 static	void vsw_port_delete(vsw_port_t *port);
76 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
77 static	void vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
78 static	int vsw_init_ldcs(vsw_port_t *port);
79 static	void vsw_uninit_ldcs(vsw_port_t *port);
80 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
81 static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
82 static	void vsw_drain_ldcs(vsw_port_t *port);
83 static	void vsw_drain_port_taskq(vsw_port_t *port);
84 static	void vsw_marker_task(void *);
85 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
86 void vsw_detach_ports(vsw_t *vswp);
87 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
88 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
89 int vsw_port_detach(vsw_t *vswp, int p_instance);
90 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
91 int vsw_port_attach(vsw_port_t *portp);
92 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
93 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
94 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
95 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
96 void vsw_reset_ports(vsw_t *vswp);
97 void vsw_port_reset(vsw_port_t *portp);
98 
99 /* Interrupt routines */
100 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
101 
102 /* Handshake routines */
103 static	void vsw_ldc_reinit(vsw_ldc_t *);
104 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
105 static	void vsw_conn_task(void *);
106 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
107 static	void vsw_next_milestone(vsw_ldc_t *);
108 static	int vsw_supported_version(vio_ver_msg_t *);
109 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
110 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
111 
112 /* Data processing routines */
113 static void vsw_process_pkt(void *);
114 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
115 static void vsw_process_ctrl_pkt(void *);
116 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
123 	uint32_t);
124 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
125 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
126 static void vsw_process_pkt_data(void *, void *, uint32_t);
127 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
128 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
129 
130 /* Switching/data transmit routines */
131 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
132 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
133 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
134 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
135 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
136 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 
138 /* Packet creation routines */
139 static void vsw_send_ver(void *);
140 static void vsw_send_attr(vsw_ldc_t *);
141 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
142 static void vsw_send_dring_info(vsw_ldc_t *);
143 static void vsw_send_rdx(vsw_ldc_t *);
144 
145 /* Dring routines */
146 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
147 static void vsw_create_privring(vsw_ldc_t *);
148 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
149 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
150     int *);
151 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
152 static int vsw_reclaim_dring(dring_info_t *dp, int start);
153 
154 static void vsw_set_lane_attr(vsw_t *, lane_t *);
155 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
156 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
157 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
158 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
159 
160 /* Rcv/Tx thread routines */
161 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
162 static void vsw_ldc_tx_worker(void *arg);
163 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
164 static void vsw_ldc_rx_worker(void *arg);
165 
166 /* Misc support routines */
167 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
168 static void vsw_free_ring(dring_info_t *);
169 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
170 static int vsw_get_same_dest_list(struct ether_header *ehp,
171     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
172 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
173 
174 /* Debugging routines */
175 static void dump_flags(uint64_t);
176 static void display_state(void);
177 static void display_lane(lane_t *);
178 static void display_ring(dring_info_t *);
179 
180 /*
181  * Functions imported from other files.
182  */
183 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
184 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
185 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
186 extern void vsw_del_mcst_port(vsw_port_t *port);
187 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
188 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
189 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
190 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
191 extern void vsw_create_vlans(void *arg, int type);
192 extern void vsw_destroy_vlans(void *arg, int type);
193 extern void vsw_vlan_add_ids(void *arg, int type);
194 extern void vsw_vlan_remove_ids(void *arg, int type);
195 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
196 	struct ether_header *ehp, uint16_t *vidp);
197 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
198 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
199 	mblk_t **npt);
200 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
201 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
202 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
203 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
204 extern void vsw_hio_stop_port(vsw_port_t *portp);
205 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
206 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
207 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
208 
209 
210 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
211 
212 /*
213  * Tunables used in this file.
214  */
215 extern int vsw_num_handshakes;
216 extern int vsw_wretries;
217 extern int vsw_desc_delay;
218 extern int vsw_read_attempts;
219 extern int vsw_ldc_tx_delay;
220 extern int vsw_ldc_tx_retries;
221 extern int vsw_ldc_retries;
222 extern int vsw_ldc_delay;
223 extern boolean_t vsw_ldc_rxthr_enabled;
224 extern boolean_t vsw_ldc_txthr_enabled;
225 extern uint32_t vsw_ntxds;
226 extern uint32_t vsw_max_tx_qcount;
227 extern uint32_t vsw_chain_len;
228 extern uint32_t vsw_mblk_size1;
229 extern uint32_t vsw_mblk_size2;
230 extern uint32_t vsw_mblk_size3;
231 extern uint32_t vsw_mblk_size4;
232 extern uint32_t vsw_num_mblks1;
233 extern uint32_t vsw_num_mblks2;
234 extern uint32_t vsw_num_mblks3;
235 extern uint32_t vsw_num_mblks4;
236 extern boolean_t vsw_obp_ver_proto_workaround;
237 extern uint32_t vsw_publish_macaddr_count;
238 extern boolean_t vsw_jumbo_rxpools;
239 
240 #define	LDC_ENTER_LOCK(ldcp)	\
241 				mutex_enter(&((ldcp)->ldc_cblock));\
242 				mutex_enter(&((ldcp)->ldc_rxlock));\
243 				mutex_enter(&((ldcp)->ldc_txlock));
244 #define	LDC_EXIT_LOCK(ldcp)	\
245 				mutex_exit(&((ldcp)->ldc_txlock));\
246 				mutex_exit(&((ldcp)->ldc_rxlock));\
247 				mutex_exit(&((ldcp)->ldc_cblock));
248 
249 #define	VSW_VER_EQ(ldcp, major, minor)	\
250 	((ldcp)->lane_out.ver_major == (major) &&	\
251 	    (ldcp)->lane_out.ver_minor == (minor))
252 
253 #define	VSW_VER_LT(ldcp, major, minor)	\
254 	(((ldcp)->lane_out.ver_major < (major)) ||	\
255 	    ((ldcp)->lane_out.ver_major == (major) &&	\
256 	    (ldcp)->lane_out.ver_minor < (minor)))
257 
258 #define	VSW_VER_GTEQ(ldcp, major, minor)	\
259 	(((ldcp)->lane_out.ver_major > (major)) ||	\
260 	    ((ldcp)->lane_out.ver_major == (major) &&	\
261 	    (ldcp)->lane_out.ver_minor >= (minor)))
262 
263 /* supported versions */
264 static	ver_sup_t	vsw_versions[] = { {1, 4} };
265 
266 /*
267  * For the moment the state dump routines have their own
268  * private flag.
269  */
270 #define	DUMP_STATE	0
271 
272 #if DUMP_STATE
273 
274 #define	DUMP_TAG(tag) \
275 {			\
276 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
277 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
278 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
279 }
280 
281 #define	DUMP_TAG_PTR(tag) \
282 {			\
283 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
284 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
285 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
286 }
287 
288 #define	DUMP_FLAGS(flags) dump_flags(flags);
289 #define	DISPLAY_STATE()	display_state()
290 
291 #else
292 
293 #define	DUMP_TAG(tag)
294 #define	DUMP_TAG_PTR(tag)
295 #define	DUMP_FLAGS(state)
296 #define	DISPLAY_STATE()
297 
298 #endif	/* DUMP_STATE */
299 
/*
 * Attach the specified port.
 *
 * Verifies no port with the same instance is already attached, sets up
 * the port's locks and state, attaches each logical domain channel
 * listed in port->ldc_ids, optionally programs the mac client, creates
 * the fdb/vlan entries and links the port into the vsw instance's
 * port list.
 *
 * Returns 0 on success, 1 on failure.  On failure the port structure
 * itself is freed here; the caller must not touch it afterwards.
 */
int
vsw_port_attach(vsw_port_t *port)
{
	vsw_t			*vswp = port->p_vswp;
	vsw_port_list_t		*plist = &vswp->plist;
	vsw_port_t		*p, **pp;
	int			i;
	int			nids = port->num_ldcs;
	uint64_t		*ldcids;
	int			rv;

	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);

	/* port already exists? */
	READ_ENTER(&plist->lockrw);
	for (p = plist->head; p != NULL; p = p->p_next) {
		if (p->p_instance == port->p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
			    __func__, p->p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}
	}
	RW_EXIT(&plist->lockrw);

	/* per-port synchronisation primitives */
	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);

	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);

	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
	port->state = VSW_PORT_INIT;

	/* attach each channel configured for this port */
	D2(vswp, "%s: %d nids", __func__, nids);
	ldcids = port->ldc_ids;
	for (i = 0; i < nids; i++) {
		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
			DERR(vswp, "%s: ldc_attach failed", __func__);
			goto exit_error;
		}
	}

	if (vswp->switching_setup_done == B_TRUE) {
		/*
		 * If the underlying network device has been setup,
		 * then open a mac client and program the mac address
		 * for this port.
		 */
		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
		if (rv != 0) {
			goto exit_error;
		}
	}

	/* create the fdb entry for this port/mac address */
	vsw_fdbe_add(vswp, port);

	vsw_create_vlans(port, VSW_VNETPORT);

	WRITE_ENTER(&plist->lockrw);

	/* link it into the list of ports for this vsw instance */
	pp = (vsw_port_t **)(&plist->head);
	port->p_next = *pp;
	*pp = port;
	plist->num_ports++;

	RW_EXIT(&plist->lockrw);

	/*
	 * Initialise the port and any ldc's under it.
	 */
	(void) vsw_init_ldcs(port);

	/* announce macaddr of vnet to the physical switch */
	if (vsw_publish_macaddr_count != 0) {	/* enabled */
		vsw_publish_macaddr(vswp, port);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);

exit_error:
	/*
	 * NOTE(review): channels already attached by vsw_ldc_attach()
	 * above are not detached on this path before their list lock is
	 * destroyed and the port is freed -- confirm whether a partial
	 * attach can leak ldcs here.
	 */
	rw_destroy(&port->p_ldclist.lockrw);

	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	rw_destroy(&port->maccl_rwlock);
	mutex_destroy(&port->tx_lock);
	mutex_destroy(&port->mca_lock);
	kmem_free(port, sizeof (vsw_port_t));
	return (1);
}
402 
403 /*
404  * Detach the specified port.
405  *
406  * Returns 0 on success, 1 on failure.
407  */
408 int
409 vsw_port_detach(vsw_t *vswp, int p_instance)
410 {
411 	vsw_port_t	*port = NULL;
412 	vsw_port_list_t	*plist = &vswp->plist;
413 
414 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
415 
416 	WRITE_ENTER(&plist->lockrw);
417 
418 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
419 		RW_EXIT(&plist->lockrw);
420 		return (1);
421 	}
422 
423 	if (vsw_plist_del_node(vswp, port)) {
424 		RW_EXIT(&plist->lockrw);
425 		return (1);
426 	}
427 
428 	/* cleanup any HybridIO for this port */
429 	vsw_hio_stop_port(port);
430 
431 	/*
432 	 * No longer need to hold writer lock on port list now
433 	 * that we have unlinked the target port from the list.
434 	 */
435 	RW_EXIT(&plist->lockrw);
436 
437 	/* Cleanup and close the mac client */
438 	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
439 
440 	/* Remove the fdb entry for this port/mac address */
441 	vsw_fdbe_del(vswp, &(port->p_macaddr));
442 	vsw_destroy_vlans(port, VSW_VNETPORT);
443 
444 	/* Remove any multicast addresses.. */
445 	vsw_del_mcst_port(port);
446 
447 	vsw_port_delete(port);
448 
449 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
450 	return (0);
451 }
452 
453 /*
454  * Detach all active ports.
455  */
456 void
457 vsw_detach_ports(vsw_t *vswp)
458 {
459 	vsw_port_list_t 	*plist = &vswp->plist;
460 	vsw_port_t		*port = NULL;
461 
462 	D1(vswp, "%s: enter", __func__);
463 
464 	WRITE_ENTER(&plist->lockrw);
465 
466 	while ((port = plist->head) != NULL) {
467 		(void) vsw_plist_del_node(vswp, port);
468 
469 		/* cleanup any HybridIO for this port */
470 		vsw_hio_stop_port(port);
471 
472 		/* Cleanup and close the mac client */
473 		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
474 
475 		/* Remove the fdb entry for this port/mac address */
476 		vsw_fdbe_del(vswp, &(port->p_macaddr));
477 		vsw_destroy_vlans(port, VSW_VNETPORT);
478 
479 		/* Remove any multicast addresses.. */
480 		vsw_del_mcst_port(port);
481 
482 		/*
483 		 * No longer need to hold the lock on the port list
484 		 * now that we have unlinked the target port from the
485 		 * list.
486 		 */
487 		RW_EXIT(&plist->lockrw);
488 		vsw_port_delete(port);
489 		WRITE_ENTER(&plist->lockrw);
490 	}
491 	RW_EXIT(&plist->lockrw);
492 
493 	D1(vswp, "%s: exit", __func__);
494 }
495 
496 /*
497  * Delete the specified port.
498  */
499 static void
500 vsw_port_delete(vsw_port_t *port)
501 {
502 	vsw_ldc_list_t 		*ldcl;
503 	vsw_t			*vswp = port->p_vswp;
504 	int			num_ldcs;
505 
506 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
507 
508 	vsw_uninit_ldcs(port);
509 
510 	/*
511 	 * Wait for any pending ctrl msg tasks which reference this
512 	 * port to finish.
513 	 */
514 	vsw_drain_port_taskq(port);
515 
516 	/*
517 	 * Wait for any active callbacks to finish
518 	 */
519 	vsw_drain_ldcs(port);
520 
521 	ldcl = &port->p_ldclist;
522 	num_ldcs = port->num_ldcs;
523 	WRITE_ENTER(&ldcl->lockrw);
524 	while (num_ldcs > 0) {
525 		vsw_ldc_detach(port, ldcl->head->ldc_id);
526 		num_ldcs--;
527 	}
528 	RW_EXIT(&ldcl->lockrw);
529 
530 	rw_destroy(&port->p_ldclist.lockrw);
531 
532 	rw_destroy(&port->maccl_rwlock);
533 	mutex_destroy(&port->mca_lock);
534 	mutex_destroy(&port->tx_lock);
535 
536 	cv_destroy(&port->state_cv);
537 	mutex_destroy(&port->state_lock);
538 
539 	if (port->num_ldcs != 0) {
540 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
541 		port->num_ldcs = 0;
542 	}
543 
544 	if (port->nvids != 0) {
545 		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
546 	}
547 
548 	kmem_free(port, sizeof (vsw_port_t));
549 
550 	D1(vswp, "%s: exit", __func__);
551 }
552 
553 static int
554 vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
555 {
556 	size_t		data_sz;
557 	int		rv;
558 	uint32_t	sz1 = 0;
559 	uint32_t	sz2 = 0;
560 	uint32_t	sz3 = 0;
561 	uint32_t	sz4 = 0;
562 
563 	/*
564 	 * We round up the mtu specified to be a multiple of 2K to limit the
565 	 * number of rx buffer pools created for a given mtu.
566 	 */
567 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
568 	data_sz = VNET_ROUNDUP_2K(data_sz);
569 
570 	/*
571 	 * If pool sizes are specified, use them. Note that the presence of
572 	 * the first tunable will be used as a hint.
573 	 */
574 	if (vsw_mblk_size1 != 0) {
575 		sz1 = vsw_mblk_size1;
576 		sz2 = vsw_mblk_size2;
577 		sz3 = vsw_mblk_size3;
578 		sz4 = vsw_mblk_size4;
579 
580 		if (sz4 == 0) { /* need 3 pools */
581 
582 			ldcp->max_rxpool_size = sz3;
583 			rv = vio_init_multipools(&ldcp->vmp,
584 			    VSW_NUM_VMPOOLS, sz1, sz2, sz3,
585 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
586 
587 		} else {
588 
589 			ldcp->max_rxpool_size = sz4;
590 			rv = vio_init_multipools(&ldcp->vmp,
591 			    VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
592 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
593 			    vsw_num_mblks4);
594 
595 		}
596 
597 		return (rv);
598 	}
599 
600 	/*
601 	 * Pool sizes are not specified. We select the pool sizes based on the
602 	 * mtu if vnet_jumbo_rxpools is enabled.
603 	 */
604 	if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
605 		/*
606 		 * Receive buffer pool allocation based on mtu is disabled.
607 		 * Use the default mechanism of standard size pool allocation.
608 		 */
609 		sz1 = VSW_MBLK_SZ_128;
610 		sz2 = VSW_MBLK_SZ_256;
611 		sz3 = VSW_MBLK_SZ_2048;
612 		ldcp->max_rxpool_size = sz3;
613 
614 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
615 		    sz1, sz2, sz3,
616 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
617 
618 		return (rv);
619 	}
620 
621 	switch (data_sz) {
622 
623 	case VNET_4K:
624 
625 		sz1 = VSW_MBLK_SZ_128;
626 		sz2 = VSW_MBLK_SZ_256;
627 		sz3 = VSW_MBLK_SZ_2048;
628 		sz4 = sz3 << 1;			/* 4K */
629 		ldcp->max_rxpool_size = sz4;
630 
631 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
632 		    sz1, sz2, sz3, sz4,
633 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
634 		    vsw_num_mblks4);
635 		break;
636 
637 	default:	/* data_sz:  4K+ to 16K */
638 
639 		sz1 = VSW_MBLK_SZ_256;
640 		sz2 = VSW_MBLK_SZ_2048;
641 		sz3 = data_sz >> 1;	/* Jumbo-size/2 */
642 		sz4 = data_sz;	/* Jumbo-size */
643 		ldcp->max_rxpool_size = sz4;
644 
645 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
646 		    sz1, sz2, sz3, sz4,
647 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
648 		    vsw_num_mblks4);
649 		break;
650 	}
651 
652 	return (rv);
653 
654 }
655 
/*
 * Attach a logical domain channel (ldc) under a specified port.
 *
 * Allocates a vsw_ldc_t, initialises its locks and handshake state,
 * calls ldc_init(), optionally spawns rx/tx worker threads, registers
 * the LDC callback, sets up kstats and links the channel onto the
 * port's channel list.  On failure the 'progress' bitmask drives the
 * partial-teardown in the ldc_attach_fail path.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t 		*vswp = port->p_vswp;
	vsw_ldc_list_t *ldcl = &port->p_ldclist;
	vsw_ldc_t 	*ldcp = NULL;
	ldc_attr_t 	attr;
	ldc_status_t	istatus;
	int 		status = DDI_FAILURE;
	char		kname[MAXNAMELEN];
	/* bitmask recording how far attach got, for failure cleanup */
	enum		{ PROG_init = 0x0,
			    PROG_callback = 0x1, PROG_rx_thread = 0x2,
			    PROG_tx_thread = 0x4}
			progress;

	progress = PROG_init;

	D1(vswp, "%s: enter", __func__);

	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
	if (ldcp == NULL) {
		DERR(vswp, "%s: kmem_zalloc failed", __func__);
		return (1);
	}
	ldcp->ldc_id = ldc_id;

	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);

	/* required for handshake with peer */
	ldcp->local_session = (uint64_t)ddi_get_lbolt();
	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hss_id = 1;	/* Initial handshake session id */

	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);

	/* only set for outbound lane, inbound set by peer */
	vsw_set_lane_attr(vswp, &ldcp->lane_out);

	attr.devclass = LDC_DEV_NT_SVC;
	attr.instance = ddi_get_instance(vswp->dip);
	attr.mode = LDC_MODE_UNRELIABLE;
	attr.mtu = VSW_LDC_MTU;
	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
		    __func__, ldc_id, status);
		goto ldc_attach_fail;
	}

	/*
	 * NOTE(review): from this point on ldc_init() has succeeded, but
	 * only the ldc_reg_callback() failure branch below calls
	 * ldc_fini(); later failure paths (ldc_status, kstats) appear to
	 * leave the ldc handle allocated -- confirm.
	 */
	if (vsw_ldc_rxthr_enabled) {
		ldcp->rx_thr_flags = 0;

		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);

		/*
		 * the progress bit is set before the NULL check so the
		 * failure path still destroys the lock/cv created above
		 */
		progress |= PROG_rx_thread;
		if (ldcp->rx_thread == NULL) {
			DWARN(vswp, "%s(%lld): Failed to create worker thread",
			    __func__, ldc_id);
			goto ldc_attach_fail;
		}
	}

	if (vsw_ldc_txthr_enabled) {
		ldcp->tx_thr_flags = 0;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;

		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);

		/* as above: set first so cleanup runs even on failure */
		progress |= PROG_tx_thread;
		if (ldcp->tx_thread == NULL) {
			DWARN(vswp, "%s(%lld): Failed to create worker thread",
			    __func__, ldc_id);
			goto ldc_attach_fail;
		}
	}

	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
		    __func__, ldc_id, status);
		(void) ldc_fini(ldcp->ldc_handle);
		goto ldc_attach_fail;
	}
	/*
	 * allocate a message for ldc_read()s, big enough to hold ctrl and
	 * data msgs, including raw data msgs used to recv priority frames.
	 */
	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);

	progress |= PROG_callback;

	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: ldc_status failed", __func__);
		mutex_destroy(&ldcp->status_lock);
		goto ldc_attach_fail;
	}

	ldcp->ldc_status = istatus;
	ldcp->ldc_port = port;
	ldcp->ldc_vswp = vswp;

	/* start with the default (pre-handshake) protocol ops */
	vsw_reset_vnet_proto_ops(ldcp);

	/*
	 * NOTE(review): if vgen_setup_kstats() fails below, status_lock
	 * (initialised above) is not destroyed by the fail path -- confirm.
	 */
	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
	    kname, &ldcp->ldc_stats);
	if (ldcp->ksp == NULL) {
		DERR(vswp, "%s: kstats setup failed", __func__);
		goto ldc_attach_fail;
	}

	/* link it into the list of channels for this port */
	WRITE_ENTER(&ldcl->lockrw);
	ldcp->ldc_next = ldcl->head;
	ldcl->head = ldcp;
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);

ldc_attach_fail:

	/* unwind only the pieces recorded in 'progress' */
	if (progress & PROG_callback) {
		(void) ldc_unreg_callback(ldcp->ldc_handle);
		kmem_free(ldcp->ldcmsg, ldcp->msglen);
	}

	if (progress & PROG_rx_thread) {
		if (ldcp->rx_thread != NULL) {
			vsw_stop_rx_thread(ldcp);
		}
		mutex_destroy(&ldcp->rx_thr_lock);
		cv_destroy(&ldcp->rx_thr_cv);
	}

	if (progress & PROG_tx_thread) {
		if (ldcp->tx_thread != NULL) {
			vsw_stop_tx_thread(ldcp);
		}
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
	}
	if (ldcp->ksp != NULL) {
		vgen_destroy_kstats(ldcp->ksp);
	}
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	mutex_destroy(&ldcp->drain_cv_lock);

	cv_destroy(&ldcp->drain_cv);

	rw_destroy(&ldcp->lane_in.dlistrw);
	rw_destroy(&ldcp->lane_out.dlistrw);

	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (1);
}
836 
837 /*
838  * Detach a logical domain channel (ldc) belonging to a
839  * particular port.
840  */
841 static void
842 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
843 {
844 	vsw_t 		*vswp = port->p_vswp;
845 	vsw_ldc_t 	*ldcp, *prev_ldcp;
846 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
847 	int 		rv;
848 	int		retries = 0;
849 
850 	prev_ldcp = ldcl->head;
851 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
852 		if (ldcp->ldc_id == ldc_id) {
853 			break;
854 		}
855 	}
856 
857 	/* specified ldc id not found */
858 	ASSERT(ldcp != NULL);
859 
860 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
861 
862 	/* Stop the receive thread */
863 	if (ldcp->rx_thread != NULL) {
864 		vsw_stop_rx_thread(ldcp);
865 		mutex_destroy(&ldcp->rx_thr_lock);
866 		cv_destroy(&ldcp->rx_thr_cv);
867 	}
868 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
869 
870 	/* Stop the tx thread */
871 	if (ldcp->tx_thread != NULL) {
872 		vsw_stop_tx_thread(ldcp);
873 		mutex_destroy(&ldcp->tx_thr_lock);
874 		cv_destroy(&ldcp->tx_thr_cv);
875 		if (ldcp->tx_mhead != NULL) {
876 			freemsgchain(ldcp->tx_mhead);
877 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
878 			ldcp->tx_cnt = 0;
879 		}
880 	}
881 
882 	/* Destory kstats */
883 	vgen_destroy_kstats(ldcp->ksp);
884 
885 	/*
886 	 * Before we can close the channel we must release any mapped
887 	 * resources (e.g. drings).
888 	 */
889 	vsw_free_lane_resources(ldcp, INBOUND);
890 	vsw_free_lane_resources(ldcp, OUTBOUND);
891 
892 	/*
893 	 * Close the channel, retry on EAAGIN.
894 	 */
895 	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
896 		if (++retries > vsw_ldc_retries) {
897 			break;
898 		}
899 		drv_usecwait(vsw_ldc_delay);
900 	}
901 	if (rv != 0) {
902 		cmn_err(CE_NOTE,
903 		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
904 		    vswp->instance, rv, ldcp->ldc_id);
905 	}
906 
907 	(void) ldc_fini(ldcp->ldc_handle);
908 
909 	ldcp->ldc_status = LDC_INIT;
910 	ldcp->ldc_handle = NULL;
911 	ldcp->ldc_vswp = NULL;
912 
913 
914 	/*
915 	 * Most likely some mblks are still in use and
916 	 * have not been returned to the pool. These mblks are
917 	 * added to the pool that is maintained in the device instance.
918 	 * Another attempt will be made to destroy the pool
919 	 * when the device detaches.
920 	 */
921 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
922 
923 	/* unlink it from the list */
924 	prev_ldcp = ldcp->ldc_next;
925 
926 	mutex_destroy(&ldcp->ldc_txlock);
927 	mutex_destroy(&ldcp->ldc_rxlock);
928 	mutex_destroy(&ldcp->ldc_cblock);
929 	cv_destroy(&ldcp->drain_cv);
930 	mutex_destroy(&ldcp->drain_cv_lock);
931 	mutex_destroy(&ldcp->status_lock);
932 	rw_destroy(&ldcp->lane_in.dlistrw);
933 	rw_destroy(&ldcp->lane_out.dlistrw);
934 
935 	kmem_free(ldcp, sizeof (vsw_ldc_t));
936 }
937 
/*
 * Open and attempt to bring up the channel. Note that channel
 * can only be brought up if peer has also opened channel.
 *
 * Opens the ldc, verifies it reaches OPEN/READY, then calls ldc_up()
 * and re-checks the status; if the channel is already UP a
 * VSW_CONN_UP event is processed immediately.
 *
 * Returns 0 if can open and bring up channel, otherwise
 * returns 1.  Note that on the post-open failure returns the channel
 * is left open; a later peer-side ldc_up() can still trigger the
 * registered callback.
 */
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
	vsw_t 		*vswp = ldcp->ldc_vswp;
	ldc_status_t	istatus = 0;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	LDC_ENTER_LOCK(ldcp);

	/* don't start at 0 in case clients don't like that */
	ldcp->next_ident = 1;

	rv = ldc_open(ldcp->ldc_handle);
	if (rv != 0) {
		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
		    __func__, ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	/* the channel must have reached OPEN or READY to proceed */
	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
		    __func__, ldcp->ldc_id, istatus);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	/* publish the status under its own lock */
	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = istatus;
	mutex_exit(&ldcp->status_lock);

	rv = ldc_up(ldcp->ldc_handle);
	if (rv != 0) {
		/*
		 * Not a fatal error for ldc_up() to fail, as peer
		 * end point may simply not be ready yet.
		 */
		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
		    ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	/*
	 * ldc_up() call is non-blocking so need to explicitly
	 * check channel status to see if in fact the channel
	 * is UP.
	 */
	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	}

	if (ldcp->ldc_status == LDC_UP) {
		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
		    ldcp->ldc_id, istatus);
		/* drop all locks before processing the connection event */
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		return (0);
	}

	/* not UP yet; the callback will handle the transition later */
	mutex_exit(&ldcp->status_lock);
	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
1025 
1026 /* disable callbacks on the channel */
static void
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	int	rv;

	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);

	LDC_ENTER_LOCK(ldcp);

	/*
	 * Ask the ldc framework to stop delivering callbacks for this
	 * channel.  A failure is logged but is not fatal; we still mark
	 * the channel as uninitialised below.
	 */
	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
	if (rv != 0) {
		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
	}

	/*
	 * Mark the channel as back in its initial state; the callback
	 * handler (vsw_ldc_cb) discards events for channels in LDC_INIT.
	 */
	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = LDC_INIT;
	mutex_exit(&ldcp->status_lock);

	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
}
1051 
1052 static int
1053 vsw_init_ldcs(vsw_port_t *port)
1054 {
1055 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1056 	vsw_ldc_t	*ldcp;
1057 
1058 	READ_ENTER(&ldcl->lockrw);
1059 	ldcp =  ldcl->head;
1060 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1061 		(void) vsw_ldc_init(ldcp);
1062 	}
1063 	RW_EXIT(&ldcl->lockrw);
1064 
1065 	return (0);
1066 }
1067 
1068 static void
1069 vsw_uninit_ldcs(vsw_port_t *port)
1070 {
1071 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1072 	vsw_ldc_t	*ldcp;
1073 
1074 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1075 
1076 	READ_ENTER(&ldcl->lockrw);
1077 	ldcp =  ldcl->head;
1078 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1079 		vsw_ldc_uninit(ldcp);
1080 	}
1081 	RW_EXIT(&ldcl->lockrw);
1082 
1083 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1084 }
1085 
1086 /*
1087  * Wait until the callback(s) associated with the ldcs under the specified
1088  * port have completed.
1089  *
1090  * Prior to this function being invoked each channel under this port
1091  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1092  *
 * A short explanation of what we are doing below..
1094  *
1095  * The simplest approach would be to have a reference counter in
1096  * the ldc structure which is increment/decremented by the callbacks as
1097  * they use the channel. The drain function could then simply disable any
1098  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1099  * there is a tiny window here - before the callback is able to get the lock
1100  * on the channel it is interrupted and this function gets to execute. It
1101  * sees that the ref count is zero and believes its free to delete the
1102  * associated data structures.
1103  *
1104  * We get around this by taking advantage of the fact that before the ldc
1105  * framework invokes a callback it sets a flag to indicate that there is a
1106  * callback active (or about to become active). If when we attempt to
1107  * unregister a callback when this active flag is set then the unregister
1108  * will fail with EWOULDBLOCK.
1109  *
1110  * If the unregister fails we do a cv_timedwait. We will either be signaled
1111  * by the callback as it is exiting (note we have to wait a short period to
1112  * allow the callback to return fully to the ldc framework and it to clear
1113  * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
1115  * callback.
1116  *
1117  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1118  * the case where the callback has finished but the ldc framework has not yet
1119  * cleared the active flag. In this case we would never get a cv_signal.
1120  */
static void
vsw_drain_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	READ_ENTER(&ldcl->lockrw);

	ldcp = ldcl->head;

	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
		/*
		 * If we can unregister the channel callback then we
		 * know that there is no callback either running or
		 * scheduled to run for this channel so move on to next
		 * channel in the list.
		 */
		mutex_enter(&ldcp->drain_cv_lock);

		/* prompt active callbacks to quit */
		ldcp->drain_state = VSW_LDC_DRAINING;

		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
			D2(vswp, "%s: unreg callback for chan %ld", __func__,
			    ldcp->ldc_id);
			mutex_exit(&ldcp->drain_cv_lock);
			continue;
		} else {
			/*
			 * If we end up here we know that either 1) a callback
			 * is currently executing, 2) is about to start (i.e.
			 * the ldc framework has set the active flag but
			 * has not actually invoked the callback yet, or 3)
			 * has finished and has returned to the ldc framework
			 * but the ldc framework has not yet cleared the
			 * active bit.
			 *
			 * Wait for it to finish.
			 *
			 * cv_timedwait (rather than cv_wait) covers case 3,
			 * where the callback will never signal us; we wake
			 * after ~1s and retry the unregister.
			 */
			while (ldc_unreg_callback(ldcp->ldc_handle)
			    == EWOULDBLOCK)
				(void) cv_timedwait(&ldcp->drain_cv,
				    &ldcp->drain_cv_lock, lbolt + hz);

			mutex_exit(&ldcp->drain_cv_lock);
			D2(vswp, "%s: unreg callback for chan %ld after "
			    "timeout", __func__, ldcp->ldc_id);
		}
	}
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
}
1177 
1178 /*
1179  * Wait until all tasks which reference this port have completed.
1180  *
1181  * Prior to this function being invoked each channel under this port
1182  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1183  */
static void
vsw_drain_port_taskq(vsw_port_t *port)
{
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Mark the port as in the process of being detached, and
	 * dispatch a marker task to the queue so we know when all
	 * relevant tasks have completed.
	 */
	mutex_enter(&port->state_lock);
	port->state = VSW_PORT_DETACHING;

	if ((vswp->taskq_p == NULL) ||
	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
		/* can't dispatch marker task; give up without waiting */
		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
		    vswp->instance);
		mutex_exit(&port->state_lock);
		return;
	}

	/*
	 * Wait for the marker task to finish.  vsw_marker_task() sets
	 * port->state to VSW_PORT_DETACHABLE and signals state_cv once
	 * all tasks queued before it have completed.
	 */
	while (port->state != VSW_PORT_DETACHABLE)
		cv_wait(&port->state_cv, &port->state_lock);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);
}
1218 
1219 static void
1220 vsw_marker_task(void *arg)
1221 {
1222 	vsw_port_t	*port = arg;
1223 	vsw_t		*vswp = port->p_vswp;
1224 
1225 	D1(vswp, "%s: enter", __func__);
1226 
1227 	mutex_enter(&port->state_lock);
1228 
1229 	/*
1230 	 * No further tasks should be dispatched which reference
1231 	 * this port so ok to mark it as safe to detach.
1232 	 */
1233 	port->state = VSW_PORT_DETACHABLE;
1234 
1235 	cv_signal(&port->state_cv);
1236 
1237 	mutex_exit(&port->state_lock);
1238 
1239 	D1(vswp, "%s: exit", __func__);
1240 }
1241 
1242 vsw_port_t *
1243 vsw_lookup_port(vsw_t *vswp, int p_instance)
1244 {
1245 	vsw_port_list_t *plist = &vswp->plist;
1246 	vsw_port_t	*port;
1247 
1248 	for (port = plist->head; port != NULL; port = port->p_next) {
1249 		if (port->p_instance == p_instance) {
1250 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1251 			return (port);
1252 		}
1253 	}
1254 
1255 	return (NULL);
1256 }
1257 
/*
 * Restart the connection to a VLAN-unaware (ver < 1.3) peer when VLAN
 * ids have been configured for the port, so that the handshake is
 * redone.  See comments in vsw_set_vnet_proto_ops().
 */
void
vsw_vlan_unaware_port_reset(vsw_port_t *portp)
{
	vsw_ldc_list_t 	*ldclp;
	vsw_ldc_t	*ldcp;

	ldclp = &portp->p_ldclist;

	READ_ENTER(&ldclp->lockrw);

	/*
	 * NOTE: for now, we will assume we have a single channel.
	 */
	if (ldclp->head == NULL) {
		RW_EXIT(&ldclp->lockrw);
		return;
	}
	ldcp = ldclp->head;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
	 * the connection. See comments in vsw_set_vnet_proto_ops().
	 */
	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
	    portp->nvids != 0) {
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
	}

	mutex_exit(&ldcp->ldc_cblock);

	RW_EXIT(&ldclp->lockrw);
}
1292 
/*
 * Reset the channel to a HybridIO-capable peer so the handshake is
 * renegotiated, triggering HybridIO setup/cleanup.  If 'immediate' is
 * set the channel is simply brought down; otherwise a full connection
 * restart is processed.
 */
void
vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
{
	vsw_ldc_list_t	*ldclp;
	vsw_ldc_t	*ldcp;

	ldclp = &portp->p_ldclist;

	READ_ENTER(&ldclp->lockrw);

	/*
	 * NOTE: for now, we will assume we have a single channel.
	 */
	if (ldclp->head == NULL) {
		RW_EXIT(&ldclp->lockrw);
		return;
	}
	ldcp = ldclp->head;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
	 * to trigger re-negotiation, which inturn trigger HybridIO
	 * setup/cleanup.
	 */
	if ((ldcp->hphase == VSW_MILESTONE4) &&
	    (portp->p_hio_capable == B_TRUE)) {
		if (immediate == B_TRUE) {
			(void) ldc_down(ldcp->ldc_handle);
		} else {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		}
	}

	mutex_exit(&ldcp->ldc_cblock);

	RW_EXIT(&ldclp->lockrw);
}
1332 
/*
 * Unconditionally reset the channel associated with this port and
 * terminate the current connection; the handshake restarts from scratch.
 */
void
vsw_port_reset(vsw_port_t *portp)
{
	vsw_ldc_list_t 	*ldclp;
	vsw_ldc_t	*ldcp;

	ldclp = &portp->p_ldclist;

	READ_ENTER(&ldclp->lockrw);

	/*
	 * NOTE: for now, we will assume we have a single channel.
	 */
	if (ldclp->head == NULL) {
		RW_EXIT(&ldclp->lockrw);
		return;
	}
	ldcp = ldclp->head;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * reset channel and terminate the connection.
	 */
	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

	mutex_exit(&ldcp->ldc_cblock);

	RW_EXIT(&ldclp->lockrw);
}
1363 
1364 void
1365 vsw_reset_ports(vsw_t *vswp)
1366 {
1367 	vsw_port_list_t	*plist = &vswp->plist;
1368 	vsw_port_t	*portp;
1369 
1370 	READ_ENTER(&plist->lockrw);
1371 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1372 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1373 			vsw_hio_stop_port(portp);
1374 		}
1375 		vsw_port_reset(portp);
1376 	}
1377 	RW_EXIT(&plist->lockrw);
1378 }
1379 
1380 
1381 /*
1382  * Search for and remove the specified port from the port
1383  * list. Returns 0 if able to locate and remove port, otherwise
1384  * returns 1.
1385  */
1386 static int
1387 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1388 {
1389 	vsw_port_list_t *plist = &vswp->plist;
1390 	vsw_port_t	*curr_p, *prev_p;
1391 
1392 	if (plist->head == NULL)
1393 		return (1);
1394 
1395 	curr_p = prev_p = plist->head;
1396 
1397 	while (curr_p != NULL) {
1398 		if (curr_p == port) {
1399 			if (prev_p == curr_p) {
1400 				plist->head = curr_p->p_next;
1401 			} else {
1402 				prev_p->p_next = curr_p->p_next;
1403 			}
1404 			plist->num_ports--;
1405 			break;
1406 		} else {
1407 			prev_p = curr_p;
1408 			curr_p = curr_p->p_next;
1409 		}
1410 	}
1411 	return (0);
1412 }
1413 
1414 /*
1415  * Interrupt handler for ldc messages.
1416  */
1417 static uint_t
1418 vsw_ldc_cb(uint64_t event, caddr_t arg)
1419 {
1420 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1421 	vsw_t 		*vswp = ldcp->ldc_vswp;
1422 
1423 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1424 
1425 	mutex_enter(&ldcp->ldc_cblock);
1426 	ldcp->ldc_stats.callbacks++;
1427 
1428 	mutex_enter(&ldcp->status_lock);
1429 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1430 		mutex_exit(&ldcp->status_lock);
1431 		mutex_exit(&ldcp->ldc_cblock);
1432 		return (LDC_SUCCESS);
1433 	}
1434 	mutex_exit(&ldcp->status_lock);
1435 
1436 	if (event & LDC_EVT_UP) {
1437 		/*
1438 		 * Channel has come up.
1439 		 */
1440 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1441 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1442 
1443 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1444 
1445 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1446 	}
1447 
1448 	if (event & LDC_EVT_READ) {
1449 		/*
1450 		 * Data available for reading.
1451 		 */
1452 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1453 		    __func__, ldcp->ldc_id, event);
1454 
1455 		if (ldcp->rx_thread != NULL) {
1456 			/*
1457 			 * If the receive thread is enabled, then
1458 			 * wakeup the receive thread to process the
1459 			 * LDC messages.
1460 			 */
1461 			mutex_exit(&ldcp->ldc_cblock);
1462 			mutex_enter(&ldcp->rx_thr_lock);
1463 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1464 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1465 				cv_signal(&ldcp->rx_thr_cv);
1466 			}
1467 			mutex_exit(&ldcp->rx_thr_lock);
1468 			mutex_enter(&ldcp->ldc_cblock);
1469 		} else {
1470 			vsw_process_pkt(ldcp);
1471 		}
1472 
1473 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1474 
1475 		goto vsw_cb_exit;
1476 	}
1477 
1478 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1479 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1480 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1481 
1482 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1483 	}
1484 
1485 	/*
1486 	 * Catch either LDC_EVT_WRITE which we don't support or any
1487 	 * unknown event.
1488 	 */
1489 	if (event &
1490 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1491 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1492 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1493 	}
1494 
1495 vsw_cb_exit:
1496 	mutex_exit(&ldcp->ldc_cblock);
1497 
1498 	/*
1499 	 * Let the drain function know we are finishing if it
1500 	 * is waiting.
1501 	 */
1502 	mutex_enter(&ldcp->drain_cv_lock);
1503 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1504 		cv_signal(&ldcp->drain_cv);
1505 	mutex_exit(&ldcp->drain_cv_lock);
1506 
1507 	return (LDC_SUCCESS);
1508 }
1509 
1510 /*
1511  * Reinitialise data structures associated with the channel.
1512  */
static void
vsw_ldc_reinit(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*port;
	vsw_ldc_list_t	*ldcl;

	D1(vswp, "%s: enter", __func__);

	/* free receive mblk pools for the channel */
	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);

	port = ldcp->ldc_port;
	ldcl = &port->p_ldclist;

	READ_ENTER(&ldcl->lockrw);

	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);

	/* release resources held by the inbound and outbound lanes */
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);
	RW_EXIT(&ldcl->lockrw);

	/* clear lane handshake state */
	ldcp->lane_in.lstate = 0;
	ldcp->lane_out.lstate = 0;

	/* Remove the fdb entry for this port/mac address */
	vsw_fdbe_del(vswp, &(port->p_macaddr));

	/* remove the port from vlans it has been assigned to */
	vsw_vlan_remove_ids(port, VSW_VNETPORT);

	/*
	 * Remove parent port from any multicast groups
	 * it may have registered with. Client must resend
	 * multicast add command after handshake completes.
	 */
	vsw_del_mcst_port(port);

	/* reset handshake tracking back to the initial milestone */
	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hcnt = 0;
	ldcp->hphase = VSW_MILESTONE0;

	vsw_reset_vnet_proto_ops(ldcp);

	D1(vswp, "%s: exit", __func__);
}
1562 
1563 /*
1564  * Process a connection event.
1565  *
1566  * Note - care must be taken to ensure that this function is
1567  * not called with the dlistrw lock held.
1568  */
static void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*conn = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
	 * being received by the callback handler, or a ECONNRESET error
	 * code being returned from a ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
	 *
	 * ldstub() atomically test-and-sets reset_active; a non-zero
	 * return means another reset/restart is already being handled.
	 */
	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
	    (ldstub((uint8_t *)&ldcp->reset_active)))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and don't need to dispatch a  new task.
	 *
	 * The reason for this check is that when we do a ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see has it come up (ldc_up() is asynch and will
	 * complete at some undefined time), and take the appropriate
	 * action.
	 *
	 * The flip side of this is that we may get a LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 */
	mutex_enter(&ldcp->status_lock);
	if (evt == VSW_CONN_UP) {
		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
			mutex_exit(&ldcp->status_lock);
			return;
		}
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	(void) atomic_inc_32(&ldcp->hss_id);

	ASSERT(vswp->taskq_p != NULL);

	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
		goto err_exit;
	}

	/* hand the event over to vsw_conn_task() via the taskq */
	conn->evt = evt;
	conn->ldcp = ldcp;

	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
	    DDI_NOSLEEP) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(conn, sizeof (vsw_conn_evt_t));
		goto err_exit;
	}

	D1(vswp, "%s: exit", __func__);
	return;

err_exit:
	/*
	 * Have most likely failed due to memory shortage. Clear the flag so
	 * that future requests will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;
}
1660 
1661 /*
1662  * Deal with events relating to a connection. Invoked from a taskq.
1663  */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_port_t	*portp;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;
	portp = ldcp->ldc_port;

	D1(vswp, "%s: enter", __func__);

	/* can safely free now have copied out data */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	/*
	 * status_lock is held across the remainder of this function;
	 * it guards ldc_status and the reset_active flag cleared below.
	 */
	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	/* tear down any active HybridIO sharing before re-init */
	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
		vsw_hio_stop(vswp, ldcp);
	}

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP, Just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if channel is now UP. This will only happen if
	 * peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		/* give up if we keep restarting without completing */
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		if (vsw_obp_ver_proto_workaround == B_FALSE &&
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS)) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count as valid restart attempt if couldn't
			 * send version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note is it possible that the taskq dispatch above may have failed,
	 * most likely due to memory shortage. We still clear the flag so
	 * future attempts will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}
1769 
1770 /*
1771  * returns 0 if legal for event signified by flag to have
 * occurred at the time it did. Otherwise returns 1.
1773  */
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint64_t	state;
	uint64_t	phase;

	/* work on a local copy of the lane state; written back on success */
	if (dir == INBOUND)
		state = ldcp->lane_in.lstate;
	else
		state = ldcp->lane_out.lstate;

	phase = ldcp->hphase;

	/*
	 * For *_INFO_RECV flags, check the handshake phase is one in
	 * which that message is legal.  For *_ACK/_NACK_RECV flags,
	 * check we actually sent the corresponding INFO message, and
	 * clear the SENT bit now that the reply has arrived.  Any
	 * violation restarts the connection and returns 1.
	 */
	switch (flag) {
	case VSW_VER_INFO_RECV:
		if (phase > VSW_MILESTONE0) {
			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_VER_ACK_RECV:
	case VSW_VER_NACK_RECV:
		if (!(state & VSW_VER_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_VER_INFO_SENT;
		break;

	case VSW_ATTR_INFO_RECV:
		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_ATTR_ACK_RECV:
	case VSW_ATTR_NACK_RECV:
		if (!(state & VSW_ATTR_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
			    " or ATTR_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_ATTR_INFO_SENT;
		break;

	case VSW_DRING_INFO_RECV:
		if (phase < VSW_MILESTONE1) {
			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_DRING_ACK_RECV:
	case VSW_DRING_NACK_RECV:
		if (!(state & VSW_DRING_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
			    " or DRING_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_DRING_INFO_SENT;
		break;

	case VSW_RDX_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_RDX_ACK_RECV:
	case VSW_RDX_NACK_RECV:
		if (!(state & VSW_RDX_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_RDX_INFO_SENT;
		break;

	case VSW_MCST_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	default:
		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
		    ldcp->ldc_id, flag);
		return (1);
	}

	/* event was legal: commit the (possibly updated) lane state */
	if (dir == INBOUND)
		ldcp->lane_in.lstate = state;
	else
		ldcp->lane_out.lstate = state;

	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);

	return (0);
}
1895 
/*
 * Advance the handshake state machine for this channel based on the
 * current milestone and the lane state flags accumulated so far.
 */
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*portp = ldcp->ldc_port;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(ldcp->lane_in.lstate);
	DUMP_FLAGS(ldcp->lane_out.lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (ldcp->lane_out.lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated version info.
		 */
		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			/* select protocol ops matching the agreed version */
			vsw_set_vnet_proto_ops(ldcp);

			/*
			 * Next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);

		}
		break;

	case VSW_MILESTONE1:
		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated attribute information.
		 */
		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {

			ldcp->hphase = VSW_MILESTONE2;

			/*
			 * If the peer device has said it wishes to
			 * use descriptor rings then we send it our ring
			 * info, otherwise we just set up a private ring
			 * which we use an internal buffer
			 */
			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
			    (VSW_VER_LT(ldcp, 1, 2) &&
			    (ldcp->lane_in.xfer_mode ==
			    VIO_DRING_MODE_V1_0))) {
				vsw_send_dring_info(ldcp);
			}
		}
		break;

	case VSW_MILESTONE2:
		/*
		 * If peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (ldcp->lane_in.xfer_mode ==
		    VIO_DRING_MODE_V1_0))) {
			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
				break;
		}

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark outbound lane as available to transmit data.
		 */
		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
			ldcp->hphase = VSW_MILESTONE4;
			ldcp->hcnt = 0;
			DISPLAY_STATE();
			/* Start HIO if enabled and capable */
			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
				D2(vswp, "%s: start HybridIO setup", __func__);
				vsw_hio_start(vswp, ldcp);
			}
		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
		}
		break;

	case VSW_MILESTONE4:
		/* handshake already complete; nothing further to do */
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}
2038 
2039 /*
2040  * Check if major version is supported.
2041  *
2042  * Returns 0 if finds supported major number, and if necessary
2043  * adjusts the minor field.
2044  *
2045  * Returns 1 if can't match major number exactly. Sets mjor/minor
2046  * to next lowest support values, or to zero if no other values possible.
2047  */
2048 static int
2049 vsw_supported_version(vio_ver_msg_t *vp)
2050 {
2051 	int	i;
2052 
2053 	D1(NULL, "vsw_supported_version: enter");
2054 
2055 	for (i = 0; i < VSW_NUM_VER; i++) {
2056 		if (vsw_versions[i].ver_major == vp->ver_major) {
2057 			/*
2058 			 * Matching or lower major version found. Update
2059 			 * minor number if necessary.
2060 			 */
2061 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
2062 				D2(NULL, "%s: adjusting minor value from %d "
2063 				    "to %d", __func__, vp->ver_minor,
2064 				    vsw_versions[i].ver_minor);
2065 				vp->ver_minor = vsw_versions[i].ver_minor;
2066 			}
2067 
2068 			return (0);
2069 		}
2070 
2071 		/*
2072 		 * If the message contains a higher major version number, set
2073 		 * the message's major/minor versions to the current values
2074 		 * and return false, so this message will get resent with
2075 		 * these values.
2076 		 */
2077 		if (vsw_versions[i].ver_major < vp->ver_major) {
2078 			D2(NULL, "%s: adjusting major and minor "
2079 			    "values to %d, %d\n",
2080 			    __func__, vsw_versions[i].ver_major,
2081 			    vsw_versions[i].ver_minor);
2082 			vp->ver_major = vsw_versions[i].ver_major;
2083 			vp->ver_minor = vsw_versions[i].ver_minor;
2084 			return (1);
2085 		}
2086 	}
2087 
2088 	/* No match was possible, zero out fields */
2089 	vp->ver_major = 0;
2090 	vp->ver_minor = 0;
2091 
2092 	D1(NULL, "vsw_supported_version: exit");
2093 
2094 	return (1);
2095 }
2096 
2097 /*
2098  * Set vnet-protocol-version dependent functions based on version.
2099  */
2100 static void
2101 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
2102 {
2103 	vsw_t	*vswp = ldcp->ldc_vswp;
2104 	lane_t	*lp = &ldcp->lane_out;
2105 
2106 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2107 		/*
2108 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
2109 		 * Support), set the mtu in our attributes to max_frame_size.
2110 		 */
2111 		lp->mtu = vswp->max_frame_size;
2112 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
2113 		/*
2114 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
2115 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
2116 		 */
2117 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
2118 	} else {
2119 		vsw_port_t	*portp = ldcp->ldc_port;
2120 		/*
2121 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2122 		 * We can negotiate that size with those peers provided only
2123 		 * pvid is defined for our peer and there are no vids. Then we
2124 		 * can send/recv only untagged frames of max size ETHERMAX.
2125 		 * Note that pvid of the peer can be different, as vsw has to
2126 		 * serve the vnet in that vlan even if itself is not assigned
2127 		 * to that vlan.
2128 		 */
2129 		if (portp->nvids == 0) {
2130 			lp->mtu = ETHERMAX;
2131 		}
2132 	}
2133 
2134 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2135 		/* Versions >= 1.2 */
2136 
2137 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2138 			/*
2139 			 * enable priority routines and pkt mode only if
2140 			 * at least one pri-eth-type is specified in MD.
2141 			 */
2142 			ldcp->tx = vsw_ldctx_pri;
2143 			ldcp->rx_pktdata = vsw_process_pkt_data;
2144 
2145 			/* set xfer mode for vsw_send_attr() */
2146 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2147 		} else {
2148 			/* no priority eth types defined in MD */
2149 
2150 			ldcp->tx = vsw_ldctx;
2151 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2152 
2153 			/* set xfer mode for vsw_send_attr() */
2154 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2155 		}
2156 
2157 	} else {
2158 		/* Versions prior to 1.2  */
2159 
2160 		vsw_reset_vnet_proto_ops(ldcp);
2161 	}
2162 }
2163 
2164 /*
2165  * Reset vnet-protocol-version dependent functions to v1.0.
2166  */
2167 static void
2168 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2169 {
2170 	lane_t	*lp = &ldcp->lane_out;
2171 
2172 	ldcp->tx = vsw_ldctx;
2173 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2174 
2175 	/* set xfer mode for vsw_send_attr() */
2176 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2177 }
2178 
2179 /*
2180  * Main routine for processing messages received over LDC.
2181  */
2182 static void
2183 vsw_process_pkt(void *arg)
2184 {
2185 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2186 	vsw_t 		*vswp = ldcp->ldc_vswp;
2187 	size_t		msglen;
2188 	vio_msg_tag_t	*tagp;
2189 	uint64_t	*ldcmsg;
2190 	int 		rv = 0;
2191 
2192 
2193 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2194 
2195 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2196 
2197 	ldcmsg = ldcp->ldcmsg;
2198 	/*
2199 	 * If channel is up read messages until channel is empty.
2200 	 */
2201 	do {
2202 		msglen = ldcp->msglen;
2203 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2204 
2205 		if (rv != 0) {
2206 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2207 			    __func__, ldcp->ldc_id, rv, msglen);
2208 		}
2209 
2210 		/* channel has been reset */
2211 		if (rv == ECONNRESET) {
2212 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2213 			break;
2214 		}
2215 
2216 		if (msglen == 0) {
2217 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2218 			    ldcp->ldc_id);
2219 			break;
2220 		}
2221 
2222 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2223 		    ldcp->ldc_id, msglen);
2224 
2225 		/*
2226 		 * Figure out what sort of packet we have gotten by
2227 		 * examining the msg tag, and then switch it appropriately.
2228 		 */
2229 		tagp = (vio_msg_tag_t *)ldcmsg;
2230 
2231 		switch (tagp->vio_msgtype) {
2232 		case VIO_TYPE_CTRL:
2233 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2234 			break;
2235 		case VIO_TYPE_DATA:
2236 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2237 			break;
2238 		case VIO_TYPE_ERR:
2239 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2240 			break;
2241 		default:
2242 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2243 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2244 			break;
2245 		}
2246 	} while (msglen);
2247 
2248 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2249 }
2250 
2251 /*
2252  * Dispatch a task to process a VIO control message.
2253  */
static void
vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
{
	vsw_ctrl_task_t		*ctaskp = NULL;
	vsw_port_t		*port = ldcp->ldc_port;
	vsw_t			*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * We need to handle RDX ACK messages in-band as once they
	 * are exchanged it is possible that we will get an
	 * immediate (legitimate) data packet.
	 */
	if ((tagp->vio_subtype_env == VIO_RDX) &&
	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {

		/* Drop the ACK if it is not legal in the current state. */
		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
			return;

		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
		D2(vswp, "%s (%ld) handling RDX_ACK in place "
		    "(ostate 0x%llx : hphase %d)", __func__,
		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
		vsw_next_milestone(ldcp);
		return;
	}

	/* Called from interrupt context, so we must not sleep here. */
	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);

	if (ctaskp == NULL) {
		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}

	/*
	 * Snapshot the message and the current handshake session id; the
	 * taskq handler uses hss_id to discard packets from a superseded
	 * handshake session.
	 */
	ctaskp->ldcp = ldcp;
	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
	ctaskp->hss_id = ldcp->hss_id;

	/*
	 * Dispatch task to processing taskq if port is not in
	 * the process of being detached.
	 */
	mutex_enter(&port->state_lock);
	if (port->state == VSW_PORT_INIT) {
		if ((vswp->taskq_p == NULL) ||
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
			mutex_exit(&port->state_lock);
			DERR(vswp, "%s: unable to dispatch task to taskq",
			    __func__);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
			return;
		}
	} else {
		/* Port is detaching; drop the message rather than dispatch. */
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		DWARN(vswp, "%s: port %d detaching, not dispatching "
		    "task", __func__, port->p_instance);
	}

	mutex_exit(&port->state_lock);

	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
	    ldcp->ldc_id);
	D1(vswp, "%s: exit", __func__);
}
2322 
2323 /*
2324  * Process a VIO ctrl message. Invoked from taskq.
2325  */
2326 static void
2327 vsw_process_ctrl_pkt(void *arg)
2328 {
2329 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2330 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2331 	vsw_t 		*vswp = ldcp->ldc_vswp;
2332 	vio_msg_tag_t	tag;
2333 	uint16_t	env;
2334 
2335 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2336 
2337 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2338 	env = tag.vio_subtype_env;
2339 
2340 	/* stale pkt check */
2341 	if (ctaskp->hss_id < ldcp->hss_id) {
2342 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2343 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2344 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2345 		return;
2346 	}
2347 
2348 	/* session id check */
2349 	if (ldcp->session_status & VSW_PEER_SESSION) {
2350 		if (ldcp->peer_session != tag.vio_sid) {
2351 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2352 			    __func__, ldcp->ldc_id, tag.vio_sid);
2353 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2354 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2355 			return;
2356 		}
2357 	}
2358 
2359 	/*
2360 	 * Switch on vio_subtype envelope, then let lower routines
2361 	 * decide if its an INFO, ACK or NACK packet.
2362 	 */
2363 	switch (env) {
2364 	case VIO_VER_INFO:
2365 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2366 		break;
2367 	case VIO_DRING_REG:
2368 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2369 		break;
2370 	case VIO_DRING_UNREG:
2371 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2372 		break;
2373 	case VIO_ATTR_INFO:
2374 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2375 		break;
2376 	case VNET_MCAST_INFO:
2377 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2378 		break;
2379 	case VIO_RDX:
2380 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2381 		break;
2382 	case VIO_DDS_INFO:
2383 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2384 		break;
2385 	default:
2386 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2387 	}
2388 
2389 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2390 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2391 }
2392 
2393 /*
2394  * Version negotiation. We can end up here either because our peer
2395  * has responded to a handshake message we have sent it, or our peer
2396  * has initiated a handshake with us. If its the former then can only
2397  * be ACK or NACK, if its the later can only be INFO.
2398  *
2399  * If its an ACK we move to the next stage of the handshake, namely
2400  * attribute exchange. If its a NACK we see if we can specify another
2401  * version, if we can't we stop.
2402  *
2403  * If it is an INFO we reset all params associated with communication
2404  * in that direction over this channel (remember connection is
2405  * essentially 2 independent simplex channels).
2406  */
void
vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_ver_msg_t	*ver_pkt;
	vsw_t 		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/version packet so
	 * cast it into the correct structure.
	 */
	ver_pkt = (vio_ver_msg_t *)pkt;

	switch (ver_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");

		/*
		 * Record the session id, which we will use from now
		 * until we see another VER_INFO msg. Even then the
		 * session id in most cases will be unchanged, except
		 * if channel was reset.
		 */
		if ((ldcp->session_status & VSW_PEER_SESSION) &&
		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
			DERR(vswp, "%s: updating session id for chan %lld "
			    "from %llx to %llx", __func__, ldcp->ldc_id,
			    ldcp->peer_session, ver_pkt->tag.vio_sid);
		}

		ldcp->peer_session = ver_pkt->tag.vio_sid;
		ldcp->session_status |= VSW_PEER_SESSION;

		/* Legal message at this time ? */
		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
			return;

		/*
		 * First check the device class. Currently only expect
		 * to be talking to a network device. In the future may
		 * also talk to another switch.
		 */
		if (ver_pkt->dev_class != VDEV_NETWORK) {
			DERR(vswp, "%s: illegal device class %d", __func__,
			    ver_pkt->dev_class);

			/* NACK the INFO back to the peer. */
			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
			vsw_next_milestone(ldcp);
			return;
		} else {
			ldcp->dev_class = ver_pkt->dev_class;
		}

		/*
		 * Now check the version.
		 */
		if (vsw_supported_version(ver_pkt) == 0) {
			/*
			 * Support this major version and possibly
			 * adjusted minor version.
			 */

			D2(vswp, "%s: accepted ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store accepted values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;

			if (vsw_obp_ver_proto_workaround == B_TRUE) {
				/*
				 * Send a version info message
				 * using the accepted version that
				 * we are about to ack. Also note that
				 * we send our ver info before we ack.
				 * Otherwise, as soon as receiving the
				 * ack, obp sends attr info msg, which
				 * breaks vsw_check_flag() invoked
				 * from vsw_process_ctrl_attr_pkt();
				 * as we also need VSW_VER_ACK_RECV to
				 * be set in lane_out.lstate, before
				 * we can receive attr info.
				 */
				vsw_send_ver(ldcp);
			}
		} else {
			/*
			 * NACK back with the next lower major/minor
			 * pairing we support (if we don't support any
			 * more versions then they will be set to zero).
			 */

			D2(vswp, "%s: replying with ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store updated values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
		}

		/* Send the ACK/NACK prepared above back to the peer. */
		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
		ver_pkt->tag.vio_sid = ldcp->local_session;
		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
		    sizeof (vio_ver_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
			return;

		/*
		 * Peer accepted the version we proposed; record it for
		 * the outbound lane.
		 */
		ldcp->lane_out.ver_major = ver_pkt->ver_major;
		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
		vsw_next_milestone(ldcp);

		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
			return;

		/*
		 * If our peer sent us a NACK with the ver fields set to
		 * zero then there is nothing more we can do. Otherwise see
		 * if we support either the version suggested, or a lesser
		 * one.
		 */
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			DERR(vswp, "%s: peer unable to negotiate any "
			    "further.", __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Check to see if we support this major version or
		 * a lower one. If we don't then maj/min will be set
		 * to zero.
		 */
		(void) vsw_supported_version(ver_pkt);
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			/* Nothing more we can do */
			DERR(vswp, "%s: version negotiation failed.\n",
			    __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
		} else {
			/* found a supported major version */
			ldcp->lane_out.ver_major = ver_pkt->ver_major;
			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

			D2(vswp, "%s: resending with updated values (%x, %x)",
			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);

			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);

		}
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    ver_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
}
2608 
2609 /*
2610  * Process an attribute packet. We can end up here either because our peer
2611  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2612  * peer has sent us an attribute INFO message
2613  *
2614  * If its an ACK we then move to the next stage of the handshake which
2615  * is to send our descriptor ring info to our peer. If its a NACK then
2616  * there is nothing more we can (currently) do.
2617  *
2618  * If we get a valid/acceptable INFO packet (and we have already negotiated
2619  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2620  * NACK back and reset channel state to INACTIV.
2621  *
2622  * FUTURE: in time we will probably negotiate over attributes, but for
2623  * the moment unacceptable attributes are regarded as a fatal error.
2624  *
2625  */
void
vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_attr_msg_t		*attr_pkt;
	vsw_t			*vswp = ldcp->ldc_vswp;
	vsw_port_t		*port = ldcp->ldc_port;
	uint64_t		macaddr = 0;
	lane_t			*lane_out = &ldcp->lane_out;
	lane_t			*lane_in = &ldcp->lane_in;
	uint32_t		mtu;
	boolean_t		ack = B_TRUE;
	int			i;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/attr packet so
	 * cast it into the correct structure.
	 */
	attr_pkt = (vnet_attr_msg_t *)pkt;

	switch (attr_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
			return;

		/*
		 * If the attributes are unacceptable then we NACK back.
		 */
		if (vsw_check_attr(attr_pkt, ldcp)) {
			ack = B_FALSE;

			DERR(vswp, "%s (chan %d): invalid attributes",
			    __func__, ldcp->ldc_id);

		} else {

			if (VSW_VER_GTEQ(ldcp, 1, 4)) {
				/*
				 * Versions >= 1.4:
				 * The mtu is negotiated down to the
				 * minimum of our mtu and peer's mtu.
				 */
				mtu = MIN(attr_pkt->mtu, vswp->max_frame_size);

				/*
				 * If we have received an ack for the attr info
				 * that we sent, then check if the mtu computed
				 * above matches the mtu that the peer had ack'd
				 * (saved in local hparams). If they don't
				 * match, we fail the handshake.
				 */
				if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
					if (mtu != lane_out->mtu) {
						/* send NACK */
						ack = B_FALSE;
					}
				} else {
					/*
					 * Save the mtu computed above in our
					 * attr parameters, so it gets sent in
					 * the attr info from us to the peer.
					 */
					lane_out->mtu = mtu;
				}
			}

		}

		if (ack == B_FALSE) {

			/* Tear down any inbound lane state built so far. */
			vsw_free_lane_resources(ldcp, INBOUND);

			attr_pkt->tag.vio_sid = ldcp->local_session;
			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
			    sizeof (vnet_attr_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Otherwise store attributes for this lane and update
		 * lane state.
		 */
		lane_in->mtu = attr_pkt->mtu;
		lane_in->addr = attr_pkt->addr;
		lane_in->addr_type = attr_pkt->addr_type;
		lane_in->xfer_mode = attr_pkt->xfer_mode;
		lane_in->ack_freq = attr_pkt->ack_freq;

		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
			/*
			 * save the MIN mtu in the msg to be replied
			 * (mtu is only computed above for versions >= 1.4)
			 */
			attr_pkt->mtu = mtu;
		}

		/* Unpack the peer's MAC address (sent as a uint64_t). */
		macaddr = lane_in->addr;
		for (i = ETHERADDRL - 1; i >= 0; i--) {
			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
			macaddr >>= 8;
		}

		/* create the fdb entry for this port/mac address */
		vsw_fdbe_add(vswp, port);

		/* add the port to the specified vlans */
		vsw_vlan_add_ids(port, VSW_VNETPORT);

		/* setup device specific xmit routines */
		mutex_enter(&port->tx_lock);
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
			port->transmit = vsw_dringsend;
		} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
			vsw_create_privring(ldcp);
			port->transmit = vsw_descrsend;
			lane_out->xfer_mode = VIO_DESC_MODE;
		}

		/*
		 * HybridIO is supported only by vnet, not by OBP.
		 * So, set hio_capable to true only when in DRING mode.
		 */
		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
		    (lane_in->xfer_mode != VIO_DESC_MODE)) {
			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
		} else {
			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
		}

		mutex_exit(&port->tx_lock);

		/* ACK the attr info back to the peer. */
		attr_pkt->tag.vio_sid = ldcp->local_session;
		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);

		lane_in->lstate |= VSW_ATTR_ACK_SENT;

		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
		    sizeof (vnet_attr_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
			return;

		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
			/*
			 * Versions >= 1.4:
			 * The ack msg sent by the peer contains the minimum of
			 * our mtu (that we had sent in our attr info) and the
			 * peer's mtu.
			 *
			 * If we have sent an ack for the attr info msg from
			 * the peer, check if the mtu that was computed then
			 * (saved in lane_out params) matches the mtu that the
			 * peer has ack'd. If they don't match, we fail the
			 * handshake.
			 *
			 * NOTE(review): the mismatch cases below just return
			 * without sending anything; presumably the handshake
			 * then fails via timeout -- confirm.
			 */
			if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
				if (lane_out->mtu != attr_pkt->mtu) {
					return;
				}
			} else {
				/*
				 * If the mtu ack'd by the peer is > our mtu
				 * fail handshake. Otherwise, save the mtu, so
				 * we can validate it when we receive attr info
				 * from our peer.
				 */
				if (attr_pkt->mtu > lane_out->mtu) {
					return;
				}
				if (attr_pkt->mtu <= lane_out->mtu) {
					lane_out->mtu = attr_pkt->mtu;
				}
			}
		}

		lane_out->lstate |= VSW_ATTR_ACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
			return;

		lane_out->lstate |= VSW_ATTR_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    attr_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
2841 
2842 /*
2843  * Process a dring info packet. We can end up here either because our peer
2844  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2845  * peer has sent us a dring INFO message.
2846  *
2847  * If we get a valid/acceptable INFO packet (and we have already negotiated
2848  * a version) we ACK back and update the lane state, otherwise we NACK back.
2849  *
2850  * FUTURE: nothing to stop client from sending us info on multiple dring's
2851  * but for the moment we will just use the first one we are given.
2852  *
2853  */
2854 void
2855 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2856 {
2857 	vio_dring_reg_msg_t	*dring_pkt;
2858 	vsw_t			*vswp = ldcp->ldc_vswp;
2859 	ldc_mem_info_t		minfo;
2860 	dring_info_t		*dp, *dbp;
2861 	int			dring_found = 0;
2862 
2863 	/*
2864 	 * We know this is a ctrl/dring packet so
2865 	 * cast it into the correct structure.
2866 	 */
2867 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2868 
2869 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2870 
2871 	switch (dring_pkt->tag.vio_subtype) {
2872 	case VIO_SUBTYPE_INFO:
2873 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2874 
2875 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2876 			return;
2877 
2878 		/*
2879 		 * If the dring params are unacceptable then we NACK back.
2880 		 */
2881 		if (vsw_check_dring_info(dring_pkt)) {
2882 
2883 			DERR(vswp, "%s (%lld): invalid dring info",
2884 			    __func__, ldcp->ldc_id);
2885 
2886 			vsw_free_lane_resources(ldcp, INBOUND);
2887 
2888 			dring_pkt->tag.vio_sid = ldcp->local_session;
2889 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2890 
2891 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2892 
2893 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2894 
2895 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2896 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2897 
2898 			vsw_next_milestone(ldcp);
2899 			return;
2900 		}
2901 
2902 		/*
2903 		 * Otherwise, attempt to map in the dring using the
2904 		 * cookie. If that succeeds we send back a unique dring
2905 		 * identifier that the sending side will use in future
2906 		 * to refer to this descriptor ring.
2907 		 */
2908 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2909 
2910 		dp->num_descriptors = dring_pkt->num_descriptors;
2911 		dp->descriptor_size = dring_pkt->descriptor_size;
2912 		dp->options = dring_pkt->options;
2913 		dp->ncookies = dring_pkt->ncookies;
2914 
2915 		/*
2916 		 * Note: should only get one cookie. Enforced in
2917 		 * the ldc layer.
2918 		 */
2919 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2920 		    sizeof (ldc_mem_cookie_t));
2921 
2922 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2923 		    dp->num_descriptors, dp->descriptor_size);
2924 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2925 		    dp->options, dp->ncookies);
2926 
2927 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2928 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2929 		    LDC_DIRECT_MAP, &(dp->handle))) != 0) {
2930 
2931 			DERR(vswp, "%s: dring_map failed\n", __func__);
2932 
2933 			kmem_free(dp, sizeof (dring_info_t));
2934 			vsw_free_lane_resources(ldcp, INBOUND);
2935 
2936 			dring_pkt->tag.vio_sid = ldcp->local_session;
2937 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2938 
2939 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2940 
2941 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2942 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2943 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2944 
2945 			vsw_next_milestone(ldcp);
2946 			return;
2947 		}
2948 
2949 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2950 
2951 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2952 
2953 			kmem_free(dp, sizeof (dring_info_t));
2954 			vsw_free_lane_resources(ldcp, INBOUND);
2955 
2956 			dring_pkt->tag.vio_sid = ldcp->local_session;
2957 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2958 
2959 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2960 
2961 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2962 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2963 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2964 
2965 			vsw_next_milestone(ldcp);
2966 			return;
2967 		} else {
2968 			/* store the address of the pub part of ring */
2969 			dp->pub_addr = minfo.vaddr;
2970 
2971 			/* cache the dring mtype */
2972 			dp->dring_mtype = minfo.mtype;
2973 		}
2974 
2975 		/* no private section as we are importing */
2976 		dp->priv_addr = NULL;
2977 
2978 		/*
2979 		 * Using simple mono increasing int for ident at
2980 		 * the moment.
2981 		 */
2982 		dp->ident = ldcp->next_ident;
2983 		ldcp->next_ident++;
2984 
2985 		dp->end_idx = 0;
2986 		dp->next = NULL;
2987 
2988 		/*
2989 		 * Link it onto the end of the list of drings
2990 		 * for this lane.
2991 		 */
2992 		if (ldcp->lane_in.dringp == NULL) {
2993 			D2(vswp, "%s: adding first INBOUND dring", __func__);
2994 			ldcp->lane_in.dringp = dp;
2995 		} else {
2996 			dbp = ldcp->lane_in.dringp;
2997 
2998 			while (dbp->next != NULL)
2999 				dbp = dbp->next;
3000 
3001 			dbp->next = dp;
3002 		}
3003 
3004 		/* acknowledge it */
3005 		dring_pkt->tag.vio_sid = ldcp->local_session;
3006 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3007 		dring_pkt->dring_ident = dp->ident;
3008 
3009 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3010 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
3011 
3012 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
3013 		vsw_next_milestone(ldcp);
3014 		break;
3015 
3016 	case VIO_SUBTYPE_ACK:
3017 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3018 
3019 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
3020 			return;
3021 
3022 		/*
3023 		 * Peer is acknowledging our dring info and will have
3024 		 * sent us a dring identifier which we will use to
3025 		 * refer to this ring w.r.t. our peer.
3026 		 */
3027 		dp = ldcp->lane_out.dringp;
3028 		if (dp != NULL) {
3029 			/*
3030 			 * Find the ring this ident should be associated
3031 			 * with.
3032 			 */
3033 			if (vsw_dring_match(dp, dring_pkt)) {
3034 				dring_found = 1;
3035 
3036 			} else while (dp != NULL) {
3037 				if (vsw_dring_match(dp, dring_pkt)) {
3038 					dring_found = 1;
3039 					break;
3040 				}
3041 				dp = dp->next;
3042 			}
3043 
3044 			if (dring_found == 0) {
3045 				DERR(NULL, "%s: unrecognised ring cookie",
3046 				    __func__);
3047 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3048 				return;
3049 			}
3050 
3051 		} else {
3052 			DERR(vswp, "%s: DRING ACK received but no drings "
3053 			    "allocated", __func__);
3054 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3055 			return;
3056 		}
3057 
3058 		/* store ident */
3059 		dp->ident = dring_pkt->dring_ident;
3060 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
3061 		vsw_next_milestone(ldcp);
3062 		break;
3063 
3064 	case VIO_SUBTYPE_NACK:
3065 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3066 
3067 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3068 			return;
3069 
3070 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
3071 		vsw_next_milestone(ldcp);
3072 		break;
3073 
3074 	default:
3075 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3076 		    dring_pkt->tag.vio_subtype);
3077 	}
3078 
3079 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3080 }
3081 
3082 /*
3083  * Process a request from peer to unregister a dring.
3084  *
3085  * For the moment we just restart the handshake if our
3086  * peer endpoint attempts to unregister a dring.
3087  */
3088 void
3089 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3090 {
3091 	vsw_t			*vswp = ldcp->ldc_vswp;
3092 	vio_dring_unreg_msg_t	*dring_pkt;
3093 
3094 	/*
3095 	 * We know this is a ctrl/dring packet so
3096 	 * cast it into the correct structure.
3097 	 */
3098 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3099 
3100 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3101 
3102 	switch (dring_pkt->tag.vio_subtype) {
3103 	case VIO_SUBTYPE_INFO:
3104 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3105 
3106 		DWARN(vswp, "%s: restarting handshake..", __func__);
3107 		break;
3108 
3109 	case VIO_SUBTYPE_ACK:
3110 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3111 
3112 		DWARN(vswp, "%s: restarting handshake..", __func__);
3113 		break;
3114 
3115 	case VIO_SUBTYPE_NACK:
3116 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3117 
3118 		DWARN(vswp, "%s: restarting handshake..", __func__);
3119 		break;
3120 
3121 	default:
3122 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3123 		    dring_pkt->tag.vio_subtype);
3124 	}
3125 
3126 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3127 
3128 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3129 }
3130 
/*
 * NACK a multicast request back to the peer.
 *
 * Wrapped in do { } while (0) so the macro expands to a single
 * statement and is safe in unbraced if/else bodies; arguments are
 * parenthesized to avoid precedence surprises.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
3136 
3137 /*
3138  * Process a multicast request from a vnet.
3139  *
3140  * Vnet's specify a multicast address that they are interested in. This
3141  * address is used as a key into the hash table which forms the multicast
3142  * forwarding database (mFDB).
3143  *
3144  * The table keys are the multicast addresses, while the table entries
3145  * are pointers to lists of ports which wish to receive packets for the
3146  * specified multicast address.
3147  *
3148  * When a multicast packet is being switched we use the address as a key
3149  * into the hash table, and then walk the appropriate port list forwarding
3150  * the pkt to each port in turn.
3151  *
3152  * If a vnet is no longer interested in a particular multicast grouping
3153  * we simply find the correct location in the hash table and then delete
3154  * the relevant port from the port list.
3155  *
3156  * To deal with the case whereby a port is being deleted without first
3157  * removing itself from the lists in the hash table, we maintain a list
3158  * of multicast addresses the port has registered an interest in, within
3159  * the port structure itself. We then simply walk that list of addresses
3160  * using them as keys into the hash table and remove the port from the
3161  * appropriate lists.
3162  */
3163 static void
3164 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3165 {
3166 	vnet_mcast_msg_t	*mcst_pkt;
3167 	vsw_port_t		*port = ldcp->ldc_port;
3168 	vsw_t			*vswp = ldcp->ldc_vswp;
3169 	int			i;
3170 
3171 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3172 
3173 	/*
3174 	 * We know this is a ctrl/mcast packet so
3175 	 * cast it into the correct structure.
3176 	 */
3177 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3178 
3179 	switch (mcst_pkt->tag.vio_subtype) {
3180 	case VIO_SUBTYPE_INFO:
3181 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3182 
3183 		/*
3184 		 * Check if in correct state to receive a multicast
3185 		 * message (i.e. handshake complete). If not reset
3186 		 * the handshake.
3187 		 */
3188 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3189 			return;
3190 
3191 		/*
3192 		 * Before attempting to add or remove address check
3193 		 * that they are valid multicast addresses.
3194 		 * If not, then NACK back.
3195 		 */
3196 		for (i = 0; i < mcst_pkt->count; i++) {
3197 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3198 				DERR(vswp, "%s: invalid multicast address",
3199 				    __func__);
3200 				SND_MCST_NACK(ldcp, mcst_pkt);
3201 				return;
3202 			}
3203 		}
3204 
3205 		/*
3206 		 * Now add/remove the addresses. If this fails we
3207 		 * NACK back.
3208 		 */
3209 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3210 			SND_MCST_NACK(ldcp, mcst_pkt);
3211 			return;
3212 		}
3213 
3214 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3215 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3216 
3217 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3218 
3219 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3220 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3221 		break;
3222 
3223 	case VIO_SUBTYPE_ACK:
3224 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3225 
3226 		/*
3227 		 * We shouldn't ever get a multicast ACK message as
3228 		 * at the moment we never request multicast addresses
3229 		 * to be set on some other device. This may change in
3230 		 * the future if we have cascading switches.
3231 		 */
3232 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3233 			return;
3234 
3235 				/* Do nothing */
3236 		break;
3237 
3238 	case VIO_SUBTYPE_NACK:
3239 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3240 
3241 		/*
3242 		 * We shouldn't get a multicast NACK packet for the
3243 		 * same reasons as we shouldn't get a ACK packet.
3244 		 */
3245 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3246 			return;
3247 
3248 				/* Do nothing */
3249 		break;
3250 
3251 	default:
3252 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3253 		    mcst_pkt->tag.vio_subtype);
3254 	}
3255 
3256 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3257 }
3258 
3259 static void
3260 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3261 {
3262 	vio_rdx_msg_t	*rdx_pkt;
3263 	vsw_t		*vswp = ldcp->ldc_vswp;
3264 
3265 	/*
3266 	 * We know this is a ctrl/rdx packet so
3267 	 * cast it into the correct structure.
3268 	 */
3269 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3270 
3271 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3272 
3273 	switch (rdx_pkt->tag.vio_subtype) {
3274 	case VIO_SUBTYPE_INFO:
3275 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3276 
3277 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3278 			return;
3279 
3280 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3281 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3282 
3283 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3284 
3285 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3286 
3287 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3288 		    sizeof (vio_rdx_msg_t), B_TRUE);
3289 
3290 		vsw_next_milestone(ldcp);
3291 		break;
3292 
3293 	case VIO_SUBTYPE_ACK:
3294 		/*
3295 		 * Should be handled in-band by callback handler.
3296 		 */
3297 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3298 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3299 		break;
3300 
3301 	case VIO_SUBTYPE_NACK:
3302 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3303 
3304 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3305 			return;
3306 
3307 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3308 		vsw_next_milestone(ldcp);
3309 		break;
3310 
3311 	default:
3312 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3313 		    rdx_pkt->tag.vio_subtype);
3314 	}
3315 
3316 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3317 }
3318 
3319 static void
3320 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3321 	uint32_t msglen)
3322 {
3323 	uint16_t	env = tagp->vio_subtype_env;
3324 	vsw_t		*vswp = ldcp->ldc_vswp;
3325 
3326 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3327 
3328 	/* session id check */
3329 	if (ldcp->session_status & VSW_PEER_SESSION) {
3330 		if (ldcp->peer_session != tagp->vio_sid) {
3331 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3332 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3333 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3334 			return;
3335 		}
3336 	}
3337 
3338 	/*
3339 	 * It is an error for us to be getting data packets
3340 	 * before the handshake has completed.
3341 	 */
3342 	if (ldcp->hphase != VSW_MILESTONE4) {
3343 		DERR(vswp, "%s: got data packet before handshake complete "
3344 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3345 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3346 		DUMP_FLAGS(ldcp->lane_in.lstate);
3347 		DUMP_FLAGS(ldcp->lane_out.lstate);
3348 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3349 		return;
3350 	}
3351 
3352 	/*
3353 	 * To reduce the locking contention, release the
3354 	 * ldc_cblock here and re-acquire it once we are done
3355 	 * receiving packets.
3356 	 */
3357 	mutex_exit(&ldcp->ldc_cblock);
3358 	mutex_enter(&ldcp->ldc_rxlock);
3359 
3360 	/*
3361 	 * Switch on vio_subtype envelope, then let lower routines
3362 	 * decide if its an INFO, ACK or NACK packet.
3363 	 */
3364 	if (env == VIO_DRING_DATA) {
3365 		vsw_process_data_dring_pkt(ldcp, dpkt);
3366 	} else if (env == VIO_PKT_DATA) {
3367 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3368 	} else if (env == VIO_DESC_DATA) {
3369 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3370 	} else {
3371 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3372 	}
3373 
3374 	mutex_exit(&ldcp->ldc_rxlock);
3375 	mutex_enter(&ldcp->ldc_cblock);
3376 
3377 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3378 }
3379 
/*
 * NACK a dring data message back to the peer.
 *
 * Wrapped in do { } while (0) so the macro expands to a single
 * statement and is safe in unbraced if/else bodies; arguments are
 * parenthesized to avoid precedence surprises.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
3385 
/*
 * Process an incoming VIO_DRING_DATA message.
 *
 * INFO: the peer has populated descriptors [start_idx, end_idx] of the
 * dring it exported to us; copy each frame into an mblk, chain the
 * mblks together and hand the chain to the switching engine, marking
 * each descriptor DONE (and ACKing ranges when requested) as we go.
 * An end_idx of -1 is an unbounded request: process until we hit a
 * descriptor that is not READY.
 *
 * ACK: the peer has consumed descriptors of the ring we exported;
 * reclaim them, and if the peer reports it has stopped processing
 * (VIO_DP_STOPPED) while more descriptors are READY, prompt it to
 * restart.
 *
 * NACK: fatal for data traffic - restart the connection.
 */
static void
vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
{
	vio_dring_msg_t		*dring_pkt;
	vnet_public_desc_t	desc, *pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	dring_info_t		*dp = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*mp = NULL;
	mblk_t			*bp = NULL;
	mblk_t			*bpt = NULL;
	size_t			nbytes = 0;
	uint64_t		chain = 0;
	uint64_t		len;
	uint32_t		pos, start;
	uint32_t		range_start, range_end;
	int32_t			end, num, cnt = 0;
	int			i, rv, rng_rv = 0, msg_rv = 0;
	boolean_t		prev_desc_ack = B_FALSE;
	int			read_attempts = 0;
	struct ether_header	*ehp;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a data/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_msg_t *)dpkt;

	/*
	 * Switch on the vio_subtype. If its INFO then we need to
	 * process the data. If its an ACK we need to make sure
	 * it makes sense (i.e did we send an earlier data/info),
	 * and if its a NACK then we maybe attempt a retry.
	 */
	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);

		/* Look up the imported ring this ident refers to. */
		READ_ENTER(&ldcp->lane_in.dlistrw);
		if ((dp = vsw_ident2dring(&ldcp->lane_in,
		    dring_pkt->dring_ident)) == NULL) {
			RW_EXIT(&ldcp->lane_in.dlistrw);

			DERR(vswp, "%s(%lld): unable to find dring from "
			    "ident 0x%llx", __func__, ldcp->ldc_id,
			    dring_pkt->dring_ident);

			SND_DRING_NACK(ldcp, dring_pkt);
			return;
		}

		start = pos = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;

		range_start = range_end = pos;

		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
		    __func__, ldcp->ldc_id, start, end);

		/*
		 * Work out how many descriptors to process.  The wrapped
		 * case (end < pos) counts pos..len-1 plus 0..end.
		 */
		if (end == -1) {
			num = -1;
		} else if (end >= 0) {
			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;

			/*
			 * basic sanity check
			 *
			 * NOTE(review): valid indices are 0..len-1, so this
			 * arguably should be (end >= len); end == len slips
			 * through the check - confirm against the protocol.
			 */
			if (end > len) {
				RW_EXIT(&ldcp->lane_in.dlistrw);
				DERR(vswp, "%s(%lld): endpoint %lld outside "
				    "ring length %lld", __func__,
				    ldcp->ldc_id, end, len);

				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}
		} else {
			RW_EXIT(&ldcp->lane_in.dlistrw);
			DERR(vswp, "%s(%lld): invalid endpoint %lld",
			    __func__, ldcp->ldc_id, end);
			SND_DRING_NACK(ldcp, dring_pkt);
			return;
		}

		while (cnt != num) {
vsw_recheck_desc:
			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;

			/* snapshot the exported descriptor at 'pos' */
			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
			    &desc, dp->dring_mtype, dp->handle,
			    pos, pos)) != 0) {
				/*
				 * NOTE(review): the argument order here looks
				 * swapped relative to the "%s(%lld) ... pos %d"
				 * format - pos prints as the channel id and
				 * vice versa.  Debug output only.
				 */
				DERR(vswp, "%s(%lld): unable to copy "
				    "descriptor at pos %d: err %d",
				    __func__, pos, ldcp->ldc_id, rng_rv);
				ldcp->ldc_stats.ierrors++;
				break;
			}

			/*
			 * When given a bounded range of descriptors
			 * to process, its an error to hit a descriptor
			 * which is not ready. In the non-bounded case
			 * (end_idx == -1) this simply indicates we have
			 * reached the end of the current active range.
			 */
			if (desc.hdr.dstate != VIO_DESC_READY) {
				/* unbound - no error */
				if (end == -1) {
					/*
					 * Poll a bounded number of times in
					 * case the peer is mid-update.
					 */
					if (read_attempts == vsw_read_attempts)
						break;

					delay(drv_usectohz(vsw_desc_delay));
					read_attempts++;
					goto vsw_recheck_desc;
				}

				/* bounded - error - so NACK back */
				RW_EXIT(&ldcp->lane_in.dlistrw);
				DERR(vswp, "%s(%lld): descriptor not READY "
				    "(%d)", __func__, ldcp->ldc_id,
				    desc.hdr.dstate);
				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}

			DTRACE_PROBE1(read_attempts, int, read_attempts);

			range_end = pos;

			/*
			 * If we ACK'd the previous descriptor then now
			 * record the new range start position for later
			 * ACK's.
			 */
			if (prev_desc_ack) {
				range_start = pos;

				D2(vswp, "%s(%lld): updating range start to be "
				    "%d", __func__, ldcp->ldc_id, range_start);

				prev_desc_ack = B_FALSE;
			}

			D2(vswp, "%s(%lld): processing desc %lld at pos"
			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
			    __func__, ldcp->ldc_id, pos, &desc,
			    desc.hdr.dstate, desc.nbytes);

			/*
			 * Undersized or oversized frames are dropped, but
			 * the descriptor is still marked DONE below so the
			 * peer can reclaim it.
			 */
			if ((desc.nbytes < ETHERMIN) ||
			    (desc.nbytes > lp->mtu)) {
				/* invalid size; drop the packet */
				ldcp->ldc_stats.ierrors++;
				goto vsw_process_desc_done;
			}

			/*
			 * Ensure that we ask ldc for an aligned
			 * number of bytes. Data is padded to align on 8
			 * byte boundary, desc.nbytes is actual data length,
			 * i.e. minus that padding.
			 */
			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
			if (nbytes > ldcp->max_rxpool_size) {
				mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
				    BPRI_MED);
			} else {
				mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
				if (mp == NULL) {
					ldcp->ldc_stats.rx_vio_allocb_fail++;
					/*
					 * No free receive buffers available,
					 * so fallback onto allocb(9F). Make
					 * sure that we get a data buffer which
					 * is a multiple of 8 as this is
					 * required by ldc_mem_copy.
					 */
					DTRACE_PROBE(allocb);
					mp = allocb(desc.nbytes +
					    VNET_IPALIGN + 8, BPRI_MED);
				}
			}
			if (mp == NULL) {
				DERR(vswp, "%s(%ld): allocb failed",
				    __func__, ldcp->ldc_id);
				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
				    dp->dring_mtype, dp->handle, pos, pos,
				    VIO_DESC_DONE);
				ldcp->ldc_stats.ierrors++;
				ldcp->ldc_stats.rx_allocb_fail++;
				break;
			}

			/* pull the frame across the channel */
			rv = ldc_mem_copy(ldcp->ldc_handle,
			    (caddr_t)mp->b_rptr, 0, &nbytes,
			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
			if (rv != 0) {
				DERR(vswp, "%s(%d): unable to copy in data "
				    "from %d cookies in desc %d (rv %d)",
				    __func__, ldcp->ldc_id, desc.ncookies,
				    pos, rv);
				freemsg(mp);

				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
				    dp->dring_mtype, dp->handle, pos, pos,
				    VIO_DESC_DONE);
				ldcp->ldc_stats.ierrors++;
				break;
			} else {
				D2(vswp, "%s(%d): copied in %ld bytes"
				    " using %d cookies", __func__,
				    ldcp->ldc_id, nbytes, desc.ncookies);
			}

			/* adjust the read pointer to skip over the padding */
			mp->b_rptr += VNET_IPALIGN;

			/* point to the actual end of data */
			mp->b_wptr = mp->b_rptr + desc.nbytes;

			/* update statistics */
			ehp = (struct ether_header *)mp->b_rptr;
			if (IS_BROADCAST(ehp))
				ldcp->ldc_stats.brdcstrcv++;
			else if (IS_MULTICAST(ehp))
				ldcp->ldc_stats.multircv++;

			ldcp->ldc_stats.ipackets++;
			ldcp->ldc_stats.rbytes += desc.nbytes;

			/*
			 * IPALIGN space can be used for VLAN_TAG
			 */
			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
			    VSW_VNETPORT, mp);

			/* build a chain of received packets */
			if (bp == NULL) {
				/* first pkt */
				bp = mp;
				bp->b_next = bp->b_prev = NULL;
				bpt = bp;
				chain = 1;
			} else {
				mp->b_next = mp->b_prev = NULL;
				bpt->b_next = mp;
				bpt = mp;
				chain++;
			}

vsw_process_desc_done:
			/* mark we are finished with this descriptor */
			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
			    dp->dring_mtype, dp->handle, pos, pos,
			    VIO_DESC_DONE)) != 0) {
				/*
				 * NOTE(review): same apparent pos/ldc_id
				 * argument swap as the copy-failure DERR
				 * above.  Debug output only.
				 */
				DERR(vswp, "%s(%lld): unable to update "
				    "dstate at pos %d: err %d",
				    __func__, pos, ldcp->ldc_id, rng_rv);
				ldcp->ldc_stats.ierrors++;
				break;
			}

			/*
			 * Send an ACK back to peer if requested.
			 */
			if (desc.hdr.ack) {
				dring_pkt->start_idx = range_start;
				dring_pkt->end_idx = range_end;

				DERR(vswp, "%s(%lld): processed %d %d, ACK"
				    " requested", __func__, ldcp->ldc_id,
				    dring_pkt->start_idx, dring_pkt->end_idx);

				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
				dring_pkt->tag.vio_sid = ldcp->local_session;

				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
				    sizeof (vio_dring_msg_t), B_FALSE);

				/*
				 * Check if ACK was successfully sent. If not
				 * we break and deal with that below.
				 */
				if (msg_rv != 0)
					break;

				prev_desc_ack = B_TRUE;
				range_start = pos;
			}

			/* next descriptor */
			pos = (pos + 1) % len;
			cnt++;

			/*
			 * Break out of loop here and stop processing to
			 * allow some other network device (or disk) to
			 * get access to the cpu.
			 */
			if (chain > vsw_chain_len) {
				D3(vswp, "%s(%lld): switching chain of %d "
				    "msgs", __func__, ldcp->ldc_id, chain);
				break;
			}
		}
		RW_EXIT(&ldcp->lane_in.dlistrw);

		/* send the chain of packets to be switched */
		if (bp != NULL) {
			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
			D3(vswp, "%s(%lld): switching chain of %d msgs",
			    __func__, ldcp->ldc_id, chain);
			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
			    ldcp->ldc_port, NULL);
		}

		/*
		 * If we encountered an error while attempting to
		 * access the imported dring, initiate a connection reset.
		 */
		if (rng_rv != 0) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			break;
		}

		/*
		 * If, when we attempted to send the ACK, we found that the
		 * channel had been reset then handle this now. We deal with
		 * it here as we cannot reset the channel while holding the
		 * dlistrw lock, and we don't want to acquire/release it
		 * continuously in the above loop, as a channel reset should
		 * be a rare event.
		 */
		if (msg_rv == ECONNRESET) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
			break;
		}

		DTRACE_PROBE1(msg_cnt, int, cnt);

		/*
		 * We are now finished so ACK back with the state
		 * set to STOPPING so our peer knows we are finished
		 */
		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		dring_pkt->tag.vio_sid = ldcp->local_session;

		dring_pkt->dring_process_state = VIO_DP_STOPPED;

		DTRACE_PROBE(stop_process_sent);

		/*
		 * We have not processed any more descriptors beyond
		 * the last one we ACK'd.
		 */
		if (prev_desc_ack)
			range_start = range_end;

		dring_pkt->start_idx = range_start;
		dring_pkt->end_idx = range_end;

		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
		    dring_pkt->end_idx);

		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
		/*
		 * Verify that the relevant descriptors are all
		 * marked as DONE
		 */
		READ_ENTER(&ldcp->lane_out.dlistrw);
		if ((dp = vsw_ident2dring(&ldcp->lane_out,
		    dring_pkt->dring_ident)) == NULL) {
			RW_EXIT(&ldcp->lane_out.dlistrw);
			DERR(vswp, "%s: unknown ident in ACK", __func__);
			return;
		}

		start = end = 0;
		start = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;


		mutex_enter(&dp->dlock);
		dp->last_ack_recv = end;
		ldcp->ldc_stats.dring_data_acks++;
		mutex_exit(&dp->dlock);

		/* reclaim descriptors the peer has consumed */
		(void) vsw_reclaim_dring(dp, start);

		/*
		 * If our peer is stopping processing descriptors then
		 * we check to make sure it has processed all the descriptors
		 * we have updated. If not then we send it a new message
		 * to prompt it to restart.
		 */
		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
			DTRACE_PROBE(stop_process_recv);
			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
			    dring_pkt->end_idx);

			/*
			 * Check next descriptor in public section of ring.
			 * If its marked as READY then we need to prompt our
			 * peer to start processing the ring again.
			 */
			i = (end + 1) % len;
			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

			/*
			 * Hold the restart lock across all of this to
			 * make sure that its not possible for us to
			 * decide that a msg needs to be sent in the future
			 * but the sending code having already checked is
			 * about to exit.
			 */
			mutex_enter(&dp->restart_lock);
			ldcp->ldc_stats.dring_stopped_acks++;
			mutex_enter(&priv_addr->dstate_lock);
			if (pub_addr->hdr.dstate == VIO_DESC_READY) {

				mutex_exit(&priv_addr->dstate_lock);

				/* reuse the ACK as the restart INFO msg */
				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
				dring_pkt->tag.vio_sid = ldcp->local_session;

				dring_pkt->start_idx = (end + 1) % len;
				dring_pkt->end_idx = -1;

				D2(vswp, "%s(%lld) : sending restart msg:"
				    " %d : %d", __func__, ldcp->ldc_id,
				    dring_pkt->start_idx, dring_pkt->end_idx);

				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
				    sizeof (vio_dring_msg_t), B_FALSE);
				ldcp->ldc_stats.dring_data_msgs++;

			} else {
				mutex_exit(&priv_addr->dstate_lock);
				dp->restart_reqd = B_TRUE;
			}
			mutex_exit(&dp->restart_lock);
		}
		RW_EXIT(&ldcp->lane_out.dlistrw);

		/* only do channel reset after dropping dlistrw lock */
		if (msg_rv == ECONNRESET)
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);

		break;

	case VIO_SUBTYPE_NACK:
		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
		    __func__, ldcp->ldc_id);
		/*
		 * Something is badly wrong if we are getting NACK's
		 * for our data pkts. So reset the channel.
		 */
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

		break;

	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
3865 
3866 /*
3867  * dummy pkt data handler function for vnet protocol version 1.0
3868  */
static void
vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
{
	/*
	 * Intentionally empty: raw-pkt data is not supported by protocol
	 * v1.0, so inbound VIO_PKT_DATA messages are silently discarded.
	 * NOTE(review): presumably installed as ldcp->rx_pktdata for
	 * v1.0 peers - confirm against the handshake/attach code.
	 */
	_NOTE(ARGUNUSED(arg1, arg2, msglen))
}
3874 
3875 /*
3876  * This function handles raw pkt data messages received over the channel.
3877  * Currently, only priority-eth-type frames are received through this mechanism.
3878  * In this case, the frame(data) is present within the message itself which
3879  * is copied into an mblk before switching it.
3880  */
3881 static void
3882 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3883 {
3884 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3885 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3886 	uint32_t		size;
3887 	mblk_t			*mp;
3888 	vsw_t			*vswp = ldcp->ldc_vswp;
3889 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3890 	lane_t			*lp = &ldcp->lane_out;
3891 
3892 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3893 	if (size < ETHERMIN || size > lp->mtu) {
3894 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3895 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3896 		    ldcp->ldc_id, size);
3897 		return;
3898 	}
3899 
3900 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3901 	if (mp == NULL) {
3902 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3903 		if (mp == NULL) {
3904 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3905 			DWARN(vswp, "%s(%lld) allocb failure, "
3906 			    "unable to process priority frame\n", __func__,
3907 			    ldcp->ldc_id);
3908 			return;
3909 		}
3910 	}
3911 
3912 	/* skip over the extra space for vlan tag */
3913 	mp->b_rptr += VLAN_TAGSZ;
3914 
3915 	/* copy the frame from the payload of raw data msg into the mblk */
3916 	bcopy(dpkt->data, mp->b_rptr, size);
3917 	mp->b_wptr = mp->b_rptr + size;
3918 
3919 	/* update stats */
3920 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3921 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3922 
3923 	/*
3924 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3925 	 */
3926 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3927 
3928 	/* switch the frame to destination */
3929 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3930 }
3931 
3932 /*
3933  * Process an in-band descriptor message (most likely from
3934  * OBP).
3935  */
3936 static void
3937 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3938 {
3939 	vnet_ibnd_desc_t	*ibnd_desc;
3940 	dring_info_t		*dp = NULL;
3941 	vsw_private_desc_t	*priv_addr = NULL;
3942 	vsw_t			*vswp = ldcp->ldc_vswp;
3943 	mblk_t			*mp = NULL;
3944 	size_t			nbytes = 0;
3945 	size_t			off = 0;
3946 	uint64_t		idx = 0;
3947 	uint32_t		num = 1, len, datalen = 0;
3948 	uint64_t		ncookies = 0;
3949 	int			i, rv;
3950 	int			j = 0;
3951 
3952 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3953 
3954 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3955 
3956 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3957 	case VIO_SUBTYPE_INFO:
3958 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3959 
3960 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3961 			return;
3962 
3963 		/*
3964 		 * Data is padded to align on a 8 byte boundary,
3965 		 * nbytes is actual data length, i.e. minus that
3966 		 * padding.
3967 		 */
3968 		datalen = ibnd_desc->nbytes;
3969 
3970 		D2(vswp, "%s(%lld): processing inband desc : "
3971 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3972 
3973 		ncookies = ibnd_desc->ncookies;
3974 
3975 		/*
3976 		 * allocb(9F) returns an aligned data block. We
3977 		 * need to ensure that we ask ldc for an aligned
3978 		 * number of bytes also.
3979 		 */
3980 		nbytes = datalen;
3981 		if (nbytes & 0x7) {
3982 			off = 8 - (nbytes & 0x7);
3983 			nbytes += off;
3984 		}
3985 
3986 		/* alloc extra space for VLAN_TAG */
3987 		mp = allocb(datalen + 8, BPRI_MED);
3988 		if (mp == NULL) {
3989 			DERR(vswp, "%s(%lld): allocb failed",
3990 			    __func__, ldcp->ldc_id);
3991 			ldcp->ldc_stats.rx_allocb_fail++;
3992 			return;
3993 		}
3994 
3995 		/* skip over the extra space for VLAN_TAG */
3996 		mp->b_rptr += 8;
3997 
3998 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3999 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
4000 		    LDC_COPY_IN);
4001 
4002 		if (rv != 0) {
4003 			DERR(vswp, "%s(%d): unable to copy in data from "
4004 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
4005 			freemsg(mp);
4006 			ldcp->ldc_stats.ierrors++;
4007 			return;
4008 		}
4009 
4010 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
4011 		    __func__, ldcp->ldc_id, nbytes, ncookies);
4012 
4013 		/* point to the actual end of data */
4014 		mp->b_wptr = mp->b_rptr + datalen;
4015 		ldcp->ldc_stats.ipackets++;
4016 		ldcp->ldc_stats.rbytes += datalen;
4017 
4018 		/*
4019 		 * We ACK back every in-band descriptor message we process
4020 		 */
4021 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
4022 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
4023 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
4024 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
4025 
4026 		/*
4027 		 * there is extra space alloc'd for VLAN_TAG
4028 		 */
4029 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
4030 
4031 		/* send the packet to be switched */
4032 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
4033 		    ldcp->ldc_port, NULL);
4034 
4035 		break;
4036 
4037 	case VIO_SUBTYPE_ACK:
4038 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4039 
4040 		/* Verify the ACK is valid */
4041 		idx = ibnd_desc->hdr.desc_handle;
4042 
4043 		if (idx >= vsw_ntxds) {
4044 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
4045 			    "(idx %ld)", vswp->instance, idx);
4046 			return;
4047 		}
4048 
4049 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4050 			DERR(vswp, "%s: no dring found", __func__);
4051 			return;
4052 		}
4053 
4054 		len = dp->num_descriptors;
4055 		/*
4056 		 * If the descriptor we are being ACK'ed for is not the
4057 		 * one we expected, then pkts were lost somwhere, either
4058 		 * when we tried to send a msg, or a previous ACK msg from
4059 		 * our peer. In either case we now reclaim the descriptors
4060 		 * in the range from the last ACK we received up to the
4061 		 * current ACK.
4062 		 */
4063 		if (idx != dp->last_ack_recv) {
4064 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
4065 			    __func__, dp->last_ack_recv, idx);
4066 			num = idx >= dp->last_ack_recv ?
4067 			    idx - dp->last_ack_recv + 1:
4068 			    (len - dp->last_ack_recv + 1) + idx;
4069 		}
4070 
4071 		/*
4072 		 * When we sent the in-band message to our peer we
4073 		 * marked the copy in our private ring as READY. We now
4074 		 * check that the descriptor we are being ACK'ed for is in
4075 		 * fact READY, i.e. it is one we have shared with our peer.
4076 		 *
4077 		 * If its not we flag an error, but still reset the descr
4078 		 * back to FREE.
4079 		 */
4080 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
4081 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4082 			mutex_enter(&priv_addr->dstate_lock);
4083 			if (priv_addr->dstate != VIO_DESC_READY) {
4084 				DERR(vswp, "%s: (%ld) desc at index %ld not "
4085 				    "READY (0x%lx)", __func__,
4086 				    ldcp->ldc_id, idx, priv_addr->dstate);
4087 				DERR(vswp, "%s: bound %d: ncookies %ld : "
4088 				    "datalen %ld", __func__,
4089 				    priv_addr->bound, priv_addr->ncookies,
4090 				    priv_addr->datalen);
4091 			}
4092 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
4093 			    ldcp->ldc_id, idx);
4094 			/* release resources associated with sent msg */
4095 			priv_addr->datalen = 0;
4096 			priv_addr->dstate = VIO_DESC_FREE;
4097 			mutex_exit(&priv_addr->dstate_lock);
4098 		}
4099 		/* update to next expected value */
4100 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
4101 
4102 		break;
4103 
4104 	case VIO_SUBTYPE_NACK:
4105 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4106 
4107 		/*
4108 		 * We should only get a NACK if our peer doesn't like
4109 		 * something about a message we have sent it. If this
4110 		 * happens we just release the resources associated with
4111 		 * the message. (We are relying on higher layers to decide
4112 		 * whether or not to resend.
4113 		 */
4114 
4115 		/* limit check */
4116 		idx = ibnd_desc->hdr.desc_handle;
4117 
4118 		if (idx >= vsw_ntxds) {
4119 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
4120 			    __func__, idx);
4121 			return;
4122 		}
4123 
4124 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4125 			DERR(vswp, "%s: no dring found", __func__);
4126 			return;
4127 		}
4128 
4129 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4130 
4131 		/* move to correct location in ring */
4132 		priv_addr += idx;
4133 
4134 		/* release resources associated with sent msg */
4135 		mutex_enter(&priv_addr->dstate_lock);
4136 		priv_addr->datalen = 0;
4137 		priv_addr->dstate = VIO_DESC_FREE;
4138 		mutex_exit(&priv_addr->dstate_lock);
4139 
4140 		break;
4141 
4142 	default:
4143 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4144 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
4145 	}
4146 
4147 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4148 }
4149 
4150 static void
4151 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
4152 {
4153 	_NOTE(ARGUNUSED(epkt))
4154 
4155 	vsw_t		*vswp = ldcp->ldc_vswp;
4156 	uint16_t	env = tagp->vio_subtype_env;
4157 
4158 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4159 
4160 	/*
4161 	 * Error vio_subtypes have yet to be defined. So for
4162 	 * the moment we can't do anything.
4163 	 */
4164 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
4165 
4166 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4167 }
4168 
4169 /* transmit the packet over the given port */
4170 int
4171 vsw_portsend(vsw_port_t *port, mblk_t *mp)
4172 {
4173 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
4174 	vsw_ldc_t 	*ldcp;
4175 	mblk_t		*mpt;
4176 	int		count;
4177 	int		status = 0;
4178 
4179 	READ_ENTER(&ldcl->lockrw);
4180 	/*
4181 	 * Note for now, we have a single channel.
4182 	 */
4183 	ldcp = ldcl->head;
4184 	if (ldcp == NULL) {
4185 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
4186 		freemsgchain(mp);
4187 		RW_EXIT(&ldcl->lockrw);
4188 		return (1);
4189 	}
4190 
4191 	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
4192 
4193 	if (count != 0) {
4194 		status = ldcp->tx(ldcp, mp, mpt, count);
4195 	}
4196 
4197 	RW_EXIT(&ldcl->lockrw);
4198 	return (status);
4199 }
4200 
4201 /*
4202  * Break up frames into 2 seperate chains: normal and
4203  * priority, based on the frame type. The number of
4204  * priority frames is also counted and returned.
4205  *
4206  * Params:
4207  * 	vswp:	pointer to the instance of vsw
4208  *	np:	head of packet chain to be broken
4209  *	npt:	tail of packet chain to be broken
4210  *
4211  * Returns:
4212  *	np:	head of normal data packets
4213  *	npt:	tail of normal data packets
4214  *	hp:	head of high priority packets
4215  *	hpt:	tail of high priority packets
4216  */
4217 static uint32_t
4218 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4219 	mblk_t **hp, mblk_t **hpt)
4220 {
4221 	mblk_t			*tmp = NULL;
4222 	mblk_t			*smp = NULL;
4223 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4224 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4225 	mblk_t			*nmp = NULL;	/* normal pkts head */
4226 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4227 	uint32_t		count = 0;
4228 	int			i;
4229 	struct ether_header	*ehp;
4230 	uint32_t		num_types;
4231 	uint16_t		*types;
4232 
4233 	tmp = *np;
4234 	while (tmp != NULL) {
4235 
4236 		smp = tmp;
4237 		tmp = tmp->b_next;
4238 		smp->b_next = NULL;
4239 		smp->b_prev = NULL;
4240 
4241 		ehp = (struct ether_header *)smp->b_rptr;
4242 		num_types = vswp->pri_num_types;
4243 		types = vswp->pri_types;
4244 		for (i = 0; i < num_types; i++) {
4245 			if (ehp->ether_type == types[i]) {
4246 				/* high priority frame */
4247 
4248 				if (hmp != NULL) {
4249 					hmpt->b_next = smp;
4250 					hmpt = smp;
4251 				} else {
4252 					hmp = hmpt = smp;
4253 				}
4254 				count++;
4255 				break;
4256 			}
4257 		}
4258 		if (i == num_types) {
4259 			/* normal data frame */
4260 
4261 			if (nmp != NULL) {
4262 				nmpt->b_next = smp;
4263 				nmpt = smp;
4264 			} else {
4265 				nmp = nmpt = smp;
4266 			}
4267 		}
4268 	}
4269 
4270 	*hp = hmp;
4271 	*hpt = hmpt;
4272 	*np = nmp;
4273 	*npt = nmpt;
4274 
4275 	return (count);
4276 }
4277 
4278 /*
4279  * Wrapper function to transmit normal and/or priority frames over the channel.
4280  */
4281 static int
4282 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4283 {
4284 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4285 	mblk_t			*tmp;
4286 	mblk_t			*smp;
4287 	mblk_t			*hmp;	/* high prio pkts head */
4288 	mblk_t			*hmpt;	/* high prio pkts tail */
4289 	mblk_t			*nmp;	/* normal pkts head */
4290 	mblk_t			*nmpt;	/* normal pkts tail */
4291 	uint32_t		n = 0;
4292 	vsw_t			*vswp = ldcp->ldc_vswp;
4293 
4294 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4295 	ASSERT(count != 0);
4296 
4297 	nmp = mp;
4298 	nmpt = mpt;
4299 
4300 	/* gather any priority frames from the chain of packets */
4301 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4302 
4303 	/* transmit priority frames */
4304 	tmp = hmp;
4305 	while (tmp != NULL) {
4306 		smp = tmp;
4307 		tmp = tmp->b_next;
4308 		smp->b_next = NULL;
4309 		vsw_ldcsend_pkt(ldcp, smp);
4310 	}
4311 
4312 	count -= n;
4313 
4314 	if (count == 0) {
4315 		/* no normal data frames to process */
4316 		return (0);
4317 	}
4318 
4319 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4320 }
4321 
4322 /*
4323  * Wrapper function to transmit normal frames over the channel.
4324  */
4325 static int
4326 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4327 {
4328 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4329 	mblk_t		*tmp = NULL;
4330 
4331 	ASSERT(count != 0);
4332 	/*
4333 	 * If the TX thread is enabled, then queue the
4334 	 * ordinary frames and signal the tx thread.
4335 	 */
4336 	if (ldcp->tx_thread != NULL) {
4337 
4338 		mutex_enter(&ldcp->tx_thr_lock);
4339 
4340 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4341 			/*
4342 			 * If we reached queue limit,
4343 			 * do not queue new packets,
4344 			 * drop them.
4345 			 */
4346 			ldcp->ldc_stats.tx_qfull += count;
4347 			mutex_exit(&ldcp->tx_thr_lock);
4348 			freemsgchain(mp);
4349 			goto exit;
4350 		}
4351 		if (ldcp->tx_mhead == NULL) {
4352 			ldcp->tx_mhead = mp;
4353 			ldcp->tx_mtail = mpt;
4354 			cv_signal(&ldcp->tx_thr_cv);
4355 		} else {
4356 			ldcp->tx_mtail->b_next = mp;
4357 			ldcp->tx_mtail = mpt;
4358 		}
4359 		ldcp->tx_cnt += count;
4360 		mutex_exit(&ldcp->tx_thr_lock);
4361 	} else {
4362 		while (mp != NULL) {
4363 			tmp = mp->b_next;
4364 			mp->b_next = mp->b_prev = NULL;
4365 			(void) vsw_ldcsend(ldcp, mp, 1);
4366 			mp = tmp;
4367 		}
4368 	}
4369 
4370 exit:
4371 	return (0);
4372 }
4373 
4374 /*
4375  * This function transmits the frame in the payload of a raw data
4376  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4377  * send special frames with high priorities, without going through
4378  * the normal data path which uses descriptor ring mechanism.
4379  */
4380 static void
4381 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4382 {
4383 	vio_raw_data_msg_t	*pkt;
4384 	mblk_t			*bp;
4385 	mblk_t			*nmp = NULL;
4386 	caddr_t			dst;
4387 	uint32_t		mblksz;
4388 	uint32_t		size;
4389 	uint32_t		nbytes;
4390 	int			rv;
4391 	vsw_t			*vswp = ldcp->ldc_vswp;
4392 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4393 
4394 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4395 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4396 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4397 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4398 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4399 		    ldcp->lane_out.lstate);
4400 		goto send_pkt_exit;
4401 	}
4402 
4403 	size = msgsize(mp);
4404 
4405 	/* frame size bigger than available payload len of raw data msg ? */
4406 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4407 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4408 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4409 		    ldcp->ldc_id, size);
4410 		goto send_pkt_exit;
4411 	}
4412 
4413 	if (size < ETHERMIN)
4414 		size = ETHERMIN;
4415 
4416 	/* alloc space for a raw data message */
4417 	nmp = vio_allocb(vswp->pri_tx_vmp);
4418 	if (nmp == NULL) {
4419 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4420 		DWARN(vswp, "vio_allocb failed\n");
4421 		goto send_pkt_exit;
4422 	}
4423 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4424 
4425 	/* copy frame into the payload of raw data message */
4426 	dst = (caddr_t)pkt->data;
4427 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4428 		mblksz = MBLKL(bp);
4429 		bcopy(bp->b_rptr, dst, mblksz);
4430 		dst += mblksz;
4431 	}
4432 
4433 	/* setup the raw data msg */
4434 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4435 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4436 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4437 	pkt->tag.vio_sid = ldcp->local_session;
4438 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4439 
4440 	/* send the msg over ldc */
4441 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4442 	if (rv != 0) {
4443 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4444 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4445 		    ldcp->ldc_id);
4446 		goto send_pkt_exit;
4447 	}
4448 
4449 	/* update stats */
4450 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4451 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4452 
4453 send_pkt_exit:
4454 	if (nmp != NULL)
4455 		freemsg(nmp);
4456 	freemsg(mp);
4457 }
4458 
/*
 * Transmit the packet over the given LDC channel.
 *
 * The 'retries' argument indicates how many times a packet
 * is retried before it is dropped. Note, the retry is done
 * only for a resource related failure, for all other failures
 * the packet is dropped immediately.
 *
 * The mblk 'mp' is always freed here before returning (the
 * transmit routines copy the data out; they do not consume mp).
 * Returns the last LDC_TX_* status from the transmit routine.
 */
static int
vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
{
	int i;
	int rc;
	int status = 0;
	vsw_port_t *port = ldcp->ldc_port;
	dring_info_t *dp = NULL;


	for (i = 0; i < retries; ) {
		/*
		 * Send the message out using the appropriate
		 * transmit function which will free mblock when it
		 * is finished with it.
		 */
		mutex_enter(&port->tx_lock);
		if (port->transmit != NULL) {
			status = (*port->transmit)(ldcp, mp);
		}
		if (status == LDC_TX_SUCCESS) {
			mutex_exit(&port->tx_lock);
			break;
		}
		i++;	/* increment the counter here */

		/* If its the last retry, then update the oerror */
		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
			ldcp->ldc_stats.oerrors++;
		}
		mutex_exit(&port->tx_lock);

		if (status != LDC_TX_NORESOURCES) {
			/*
			 * No retrying required for errors un-related
			 * to resources.
			 */
			break;
		}
		/*
		 * Out of descriptors: try to reclaim any that the peer
		 * has finished with before retrying. Reclaim is only
		 * valid when a dring exists and the negotiated xfer
		 * mode is a dring mode for the agreed protocol version.
		 */
		READ_ENTER(&ldcp->lane_out.dlistrw);
		if (((dp = ldcp->lane_out.dringp) != NULL) &&
		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    ((VSW_VER_LT(ldcp, 1, 2) &&
		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
			rc = vsw_reclaim_dring(dp, dp->end_idx);
		} else {
			/*
			 * If there is no dring or the xfer_mode is
			 * set to DESC_MODE(ie., OBP), then simply break here.
			 */
			RW_EXIT(&ldcp->lane_out.dlistrw);
			break;
		}
		RW_EXIT(&ldcp->lane_out.dlistrw);

		/*
		 * Delay only if none were reclaimed
		 * and its not the last retry.
		 */
		if ((rc == 0) && (i < retries)) {
			delay(drv_usectohz(vsw_ldc_tx_delay));
		}
	}
	freemsg(mp);
	return (status);
}
4534 
/*
 * Send packet out via descriptor ring to a logical device.
 *
 * The frame data is copied from 'mp' into the transmit buffer of a
 * free private descriptor; 'mp' itself is not freed here (the caller
 * retains ownership — see vsw_ldcsend()).
 *
 * Returns LDC_TX_SUCCESS, LDC_TX_FAILURE (lane/channel down, no dring,
 * oversized frame) or LDC_TX_NORESOURCES (no free descriptor).
 */
static int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vio_dring_msg_t		dring_pkt;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_desc = NULL;
	vnet_public_desc_t	*pub = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*bp;
	size_t			n, size;
	caddr_t			bufp;
	int			idx;
	int			status = LDC_TX_SUCCESS;
	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);

	/* TODO: make test a macro */
	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Note - using first ring only, this may change
	 * in the future.
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld): no dring for outbound lane on"
		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/* frame must fit within the negotiated MTU for this lane */
	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor
	 *
	 * Note: for the moment we are assuming that we will only
	 * have one dring going from the switch to each of its
	 * peers. This may change in the future.
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		D2(vswp, "%s(%lld): no descriptor available for ring "
		    "at 0x%llx", __func__, ldcp->ldc_id, dp);

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		ldcp->ldc_stats.tx_no_desc++;
		goto vsw_dringsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
	}

	/*
	 * Copy data into the descriptor's buffer, past the VNET_IPALIGN
	 * pad bytes (extra space reserved at the front of the buffer).
	 */
	bufp = priv_desc->datap;
	bufp += VNET_IPALIGN;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/* pad runt frames up to the ethernet minimum */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	pub = priv_desc->descp;
	pub->nbytes = priv_desc->datalen;

	/* update statistics */
	if (IS_BROADCAST(ehp))
		ldcp->ldc_stats.brdcstxmt++;
	else if (IS_MULTICAST(ehp))
		ldcp->ldc_stats.multixmt++;
	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += priv_desc->datalen;

	/* hand the descriptor to the peer */
	mutex_enter(&priv_desc->dstate_lock);
	pub->hdr.dstate = VIO_DESC_READY;
	mutex_exit(&priv_desc->dstate_lock);

	/*
	 * Determine whether or not we need to send a message to our
	 * peer prompting them to read our newly updated descriptor(s).
	 */
	mutex_enter(&dp->restart_lock);
	if (dp->restart_reqd) {
		dp->restart_reqd = B_FALSE;
		ldcp->ldc_stats.dring_data_msgs++;
		mutex_exit(&dp->restart_lock);

		/*
		 * Send a vio_dring_msg to peer to prompt them to read
		 * the updated descriptor ring.
		 */
		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
		dring_pkt.tag.vio_sid = ldcp->local_session;

		/* Note - for now using first ring */
		dring_pkt.dring_ident = dp->ident;

		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any ack's yet, so this must be the first
		 * msg sent, so set the start to the begining of the ring.
		 */
		mutex_enter(&dp->dlock);
		if (dp->last_ack_recv == -1) {
			dring_pkt.start_idx = 0;
		} else {
			dring_pkt.start_idx =
			    (dp->last_ack_recv + 1) % dp->num_descriptors;
		}
		dring_pkt.end_idx = -1;
		mutex_exit(&dp->dlock);

		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
		    dring_pkt.end_idx);

		/* must drop dlistrw before vsw_send_msg may reset channel */
		RW_EXIT(&ldcp->lane_out.dlistrw);

		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);

		return (status);

	} else {
		mutex_exit(&dp->restart_lock);
		D2(vswp, "%s(%lld): updating descp %d", __func__,
		    ldcp->ldc_id, idx);
	}

vsw_dringsend_free_exit:

	RW_EXIT(&ldcp->lane_out.dlistrw);

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
	return (status);
}
4697 
/*
 * Send an in-band descriptor message over ldc.
 *
 * Used when the dring serves only as an internal buffer rather than a
 * transfer channel (presumably the DESC_MODE/OBP case — see the
 * matching mode test in vsw_ldcsend()): the data is copied into a
 * private descriptor and its memory cookies are carried inside the
 * message itself. 'mp' is not freed here; the caller retains ownership.
 *
 * Returns LDC_TX_SUCCESS, LDC_TX_FAILURE or LDC_TX_NORESOURCES.
 */
static int
vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vnet_ibnd_desc_t	ibnd_msg;
	vsw_private_desc_t	*priv_desc = NULL;
	dring_info_t		*dp = NULL;
	size_t			n, size = 0;
	caddr_t			bufp;
	mblk_t			*bp;
	int			idx, i;
	int			status = LDC_TX_SUCCESS;
	static int		warn_msg = 1;	/* one-shot "ring full" warning */
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ASSERT(mp != NULL);

	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
		    __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * only expect single dring to exist, which we use
	 * as an internal buffer, rather than a transfer channel.
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		DERR(vswp, "%s(%lld): no dring for outbound lane",
		    __func__, ldcp->ldc_id);
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
		RW_EXIT(&ldcp->lane_out.dlistrw);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/* frame must fit within the negotiated MTU for this lane */
	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor in our buffer ring
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		/* warn only once until a descriptor becomes free again */
		if (warn_msg) {
			DERR(vswp, "%s(%lld): no descriptor available for ring "
			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
			warn_msg = 0;
		}

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		goto vsw_descrsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos "
		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
		warn_msg = 1;
	}

	/* copy data into the descriptor */
	bufp = priv_desc->datap;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/* pad runt frames up to the ethernet minimum */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	/* create and send the in-band descp msg */
	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;

	/*
	 * Copy the mem cookies describing the data from the
	 * private region of the descriptor ring into the inband
	 * descriptor.
	 */
	for (i = 0; i < priv_desc->ncookies; i++) {
		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
		    sizeof (ldc_mem_cookie_t));
	}

	ibnd_msg.hdr.desc_handle = idx;
	ibnd_msg.ncookies = priv_desc->ncookies;
	/*
	 * NOTE(review): nbytes carries the unpadded frame size while
	 * datalen above was padded to ETHERMIN — confirm this is the
	 * intended wire behavior.
	 */
	ibnd_msg.nbytes = size;

	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += size;

	/* must drop dlistrw before vsw_send_msg may reset channel */
	RW_EXIT(&ldcp->lane_out.dlistrw);

	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
	    sizeof (vnet_ibnd_desc_t), B_TRUE);

vsw_descrsend_free_exit:

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
	return (status);
}
4816 
4817 static void
4818 vsw_send_ver(void *arg)
4819 {
4820 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4821 	vsw_t		*vswp = ldcp->ldc_vswp;
4822 	lane_t		*lp = &ldcp->lane_out;
4823 	vio_ver_msg_t	ver_msg;
4824 
4825 	D1(vswp, "%s enter", __func__);
4826 
4827 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4828 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4829 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4830 	ver_msg.tag.vio_sid = ldcp->local_session;
4831 
4832 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4833 		ver_msg.ver_major = vsw_versions[0].ver_major;
4834 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4835 	} else {
4836 		/* use the major,minor that we've ack'd */
4837 		lane_t	*lpi = &ldcp->lane_in;
4838 		ver_msg.ver_major = lpi->ver_major;
4839 		ver_msg.ver_minor = lpi->ver_minor;
4840 	}
4841 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4842 
4843 	lp->lstate |= VSW_VER_INFO_SENT;
4844 	lp->ver_major = ver_msg.ver_major;
4845 	lp->ver_minor = ver_msg.ver_minor;
4846 
4847 	DUMP_TAG(ver_msg.tag);
4848 
4849 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4850 
4851 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4852 }
4853 
4854 static void
4855 vsw_send_attr(vsw_ldc_t *ldcp)
4856 {
4857 	vsw_t			*vswp = ldcp->ldc_vswp;
4858 	lane_t			*lp = &ldcp->lane_out;
4859 	vnet_attr_msg_t		attr_msg;
4860 
4861 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4862 
4863 	/*
4864 	 * Subtype is set to INFO by default
4865 	 */
4866 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4867 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4868 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4869 	attr_msg.tag.vio_sid = ldcp->local_session;
4870 
4871 	/* payload copied from default settings for lane */
4872 	attr_msg.mtu = lp->mtu;
4873 	attr_msg.addr_type = lp->addr_type;
4874 	attr_msg.xfer_mode = lp->xfer_mode;
4875 	attr_msg.ack_freq = lp->xfer_mode;
4876 
4877 	READ_ENTER(&vswp->if_lockrw);
4878 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4879 	RW_EXIT(&vswp->if_lockrw);
4880 
4881 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4882 
4883 	DUMP_TAG(attr_msg.tag);
4884 
4885 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4886 
4887 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4888 }
4889 
4890 /*
4891  * Create dring info msg (which also results in the creation of
4892  * a dring).
4893  */
4894 static vio_dring_reg_msg_t *
4895 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4896 {
4897 	vio_dring_reg_msg_t	*mp;
4898 	dring_info_t		*dp;
4899 	vsw_t			*vswp = ldcp->ldc_vswp;
4900 	int			rv;
4901 
4902 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4903 
4904 	/*
4905 	 * If we can't create a dring, obviously no point sending
4906 	 * a message.
4907 	 */
4908 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4909 		return (NULL);
4910 
4911 	/* Allocate pools of receive mblks */
4912 	rv = vsw_init_multipools(ldcp, vswp);
4913 	if (rv) {
4914 		/*
4915 		 * We do not return failure if receive mblk pools can't be
4916 		 * allocated, instead allocb(9F) will be used to dynamically
4917 		 * allocate buffers during receive.
4918 		 */
4919 		DWARN(vswp, "%s: unable to create free mblk pools for"
4920 		    " channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
4921 	}
4922 
4923 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4924 
4925 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4926 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4927 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4928 	mp->tag.vio_sid = ldcp->local_session;
4929 
4930 	/* payload */
4931 	mp->num_descriptors = dp->num_descriptors;
4932 	mp->descriptor_size = dp->descriptor_size;
4933 	mp->options = dp->options;
4934 	mp->ncookies = dp->ncookies;
4935 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4936 
4937 	mp->dring_ident = 0;
4938 
4939 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4940 
4941 	return (mp);
4942 }
4943 
4944 static void
4945 vsw_send_dring_info(vsw_ldc_t *ldcp)
4946 {
4947 	vio_dring_reg_msg_t	*dring_msg;
4948 	vsw_t			*vswp = ldcp->ldc_vswp;
4949 
4950 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4951 
4952 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4953 	if (dring_msg == NULL) {
4954 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4955 		    vswp->instance, __func__);
4956 		return;
4957 	}
4958 
4959 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4960 
4961 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4962 
4963 	(void) vsw_send_msg(ldcp, dring_msg,
4964 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4965 
4966 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4967 
4968 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4969 }
4970 
4971 static void
4972 vsw_send_rdx(vsw_ldc_t *ldcp)
4973 {
4974 	vsw_t		*vswp = ldcp->ldc_vswp;
4975 	vio_rdx_msg_t	rdx_msg;
4976 
4977 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4978 
4979 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4980 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4981 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4982 	rdx_msg.tag.vio_sid = ldcp->local_session;
4983 
4984 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4985 
4986 	DUMP_TAG(rdx_msg.tag);
4987 
4988 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4989 
4990 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4991 }
4992 
4993 /*
4994  * Generic routine to send message out over ldc channel.
4995  *
4996  * It is possible that when we attempt to write over the ldc channel
4997  * that we get notified that it has been reset. Depending on the value
4998  * of the handle_reset flag we either handle that event here or simply
4999  * notify the caller that the channel was reset.
5000  */
5001 int
5002 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
5003 {
5004 	int			rv;
5005 	size_t			msglen = size;
5006 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
5007 	vsw_t			*vswp = ldcp->ldc_vswp;
5008 	vio_dring_msg_t		*dmsg;
5009 	vio_raw_data_msg_t	*rmsg;
5010 	vnet_ibnd_desc_t	*imsg;
5011 	boolean_t		data_msg = B_FALSE;
5012 
5013 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
5014 	    ldcp->ldc_id, size);
5015 
5016 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
5017 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
5018 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
5019 
5020 	mutex_enter(&ldcp->ldc_txlock);
5021 
5022 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
5023 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
5024 			dmsg = (vio_dring_msg_t *)tag;
5025 			dmsg->seq_num = ldcp->lane_out.seq_num;
5026 			data_msg = B_TRUE;
5027 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
5028 			rmsg = (vio_raw_data_msg_t *)tag;
5029 			rmsg->seq_num = ldcp->lane_out.seq_num;
5030 			data_msg = B_TRUE;
5031 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
5032 			imsg = (vnet_ibnd_desc_t *)tag;
5033 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
5034 			data_msg = B_TRUE;
5035 		}
5036 	}
5037 
5038 	do {
5039 		msglen = size;
5040 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
5041 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
5042 
5043 	if (rv == 0 && data_msg == B_TRUE) {
5044 		ldcp->lane_out.seq_num++;
5045 	}
5046 
5047 	if ((rv != 0) || (msglen != size)) {
5048 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
5049 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
5050 		ldcp->ldc_stats.oerrors++;
5051 	}
5052 
5053 	mutex_exit(&ldcp->ldc_txlock);
5054 
5055 	/*
5056 	 * If channel has been reset we either handle it here or
5057 	 * simply report back that it has been reset and let caller
5058 	 * decide what to do.
5059 	 */
5060 	if (rv == ECONNRESET) {
5061 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
5062 
5063 		/*
5064 		 * N.B - must never be holding the dlistrw lock when
5065 		 * we do a reset of the channel.
5066 		 */
5067 		if (handle_reset) {
5068 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
5069 		}
5070 	}
5071 
5072 	return (rv);
5073 }
5074 
5075 /*
5076  * Remove the specified address from the list of address maintained
5077  * in this port node.
5078  */
5079 mcst_addr_t *
5080 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
5081 {
5082 	vsw_t		*vswp = NULL;
5083 	vsw_port_t	*port = NULL;
5084 	mcst_addr_t	*prev_p = NULL;
5085 	mcst_addr_t	*curr_p = NULL;
5086 
5087 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
5088 	    __func__, devtype, addr);
5089 
5090 	if (devtype == VSW_VNETPORT) {
5091 		port = (vsw_port_t *)arg;
5092 		mutex_enter(&port->mca_lock);
5093 		prev_p = curr_p = port->mcap;
5094 	} else {
5095 		vswp = (vsw_t *)arg;
5096 		mutex_enter(&vswp->mca_lock);
5097 		prev_p = curr_p = vswp->mcap;
5098 	}
5099 
5100 	while (curr_p != NULL) {
5101 		if (curr_p->addr == addr) {
5102 			D2(NULL, "%s: address found", __func__);
5103 			/* match found */
5104 			if (prev_p == curr_p) {
5105 				/* list head */
5106 				if (devtype == VSW_VNETPORT)
5107 					port->mcap = curr_p->nextp;
5108 				else
5109 					vswp->mcap = curr_p->nextp;
5110 			} else {
5111 				prev_p->nextp = curr_p->nextp;
5112 			}
5113 			break;
5114 		} else {
5115 			prev_p = curr_p;
5116 			curr_p = curr_p->nextp;
5117 		}
5118 	}
5119 
5120 	if (devtype == VSW_VNETPORT)
5121 		mutex_exit(&port->mca_lock);
5122 	else
5123 		mutex_exit(&vswp->mca_lock);
5124 
5125 	D1(NULL, "%s: exit", __func__);
5126 
5127 	return (curr_p);
5128 }
5129 
5130 /*
5131  * Creates a descriptor ring (dring) and links it into the
5132  * link of outbound drings for this channel.
5133  *
5134  * Returns NULL if creation failed.
5135  */
5136 static dring_info_t *
5137 vsw_create_dring(vsw_ldc_t *ldcp)
5138 {
5139 	vsw_private_desc_t	*priv_addr = NULL;
5140 	vsw_t			*vswp = ldcp->ldc_vswp;
5141 	ldc_mem_info_t		minfo;
5142 	dring_info_t		*dp, *tp;
5143 	int			i;
5144 
5145 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5146 
5147 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5148 
5149 	/* create public section of ring */
5150 	if ((ldc_mem_dring_create(vsw_ntxds,
5151 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
5152 
5153 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
5154 		    "failed", ldcp->ldc_id);
5155 		goto create_fail_exit;
5156 	}
5157 
5158 	ASSERT(dp->handle != NULL);
5159 
5160 	/*
5161 	 * Get the base address of the public section of the ring.
5162 	 */
5163 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5164 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
5165 		    ldcp->ldc_id);
5166 		goto dring_fail_exit;
5167 	} else {
5168 		ASSERT(minfo.vaddr != 0);
5169 		dp->pub_addr = minfo.vaddr;
5170 	}
5171 
5172 	dp->num_descriptors = vsw_ntxds;
5173 	dp->descriptor_size = VSW_PUB_SIZE;
5174 	dp->options = VIO_TX_DRING;
5175 	dp->ncookies = 1;	/* guaranteed by ldc */
5176 
5177 	/*
5178 	 * create private portion of ring
5179 	 */
5180 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
5181 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5182 
5183 	if (vsw_setup_ring(ldcp, dp)) {
5184 		DERR(vswp, "%s: unable to setup ring", __func__);
5185 		goto dring_fail_exit;
5186 	}
5187 
5188 	/* haven't used any descriptors yet */
5189 	dp->end_idx = 0;
5190 	dp->last_ack_recv = -1;
5191 
5192 	/* bind dring to the channel */
5193 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
5194 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
5195 	    &dp->cookie[0], &dp->ncookies)) != 0) {
5196 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
5197 		    "%lld", ldcp->ldc_id);
5198 		goto dring_fail_exit;
5199 	}
5200 
5201 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5202 	dp->restart_reqd = B_TRUE;
5203 
5204 	/*
5205 	 * Only ever create rings for outgoing lane. Link it onto
5206 	 * end of list.
5207 	 */
5208 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5209 	if (ldcp->lane_out.dringp == NULL) {
5210 		D2(vswp, "vsw_create_dring: adding first outbound ring");
5211 		ldcp->lane_out.dringp = dp;
5212 	} else {
5213 		tp = ldcp->lane_out.dringp;
5214 		while (tp->next != NULL)
5215 			tp = tp->next;
5216 
5217 		tp->next = dp;
5218 	}
5219 	RW_EXIT(&ldcp->lane_out.dlistrw);
5220 
5221 	return (dp);
5222 
5223 dring_fail_exit:
5224 	(void) ldc_mem_dring_destroy(dp->handle);
5225 
5226 create_fail_exit:
5227 	if (dp->priv_addr != NULL) {
5228 		priv_addr = dp->priv_addr;
5229 		for (i = 0; i < vsw_ntxds; i++) {
5230 			if (priv_addr->memhandle != NULL)
5231 				(void) ldc_mem_free_handle(
5232 				    priv_addr->memhandle);
5233 			priv_addr++;
5234 		}
5235 		kmem_free(dp->priv_addr,
5236 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5237 	}
5238 	mutex_destroy(&dp->dlock);
5239 
5240 	kmem_free(dp, sizeof (dring_info_t));
5241 	return (NULL);
5242 }
5243 
5244 /*
5245  * Create a ring consisting of just a private portion and link
5246  * it into the list of rings for the outbound lane.
5247  *
5248  * These type of rings are used primarily for temporary data
5249  * storage (i.e. as data buffers).
5250  */
5251 void
5252 vsw_create_privring(vsw_ldc_t *ldcp)
5253 {
5254 	dring_info_t		*dp, *tp;
5255 	vsw_t			*vswp = ldcp->ldc_vswp;
5256 
5257 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5258 
5259 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5260 
5261 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5262 
5263 	/* no public section */
5264 	dp->pub_addr = NULL;
5265 
5266 	dp->priv_addr = kmem_zalloc(
5267 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5268 
5269 	dp->num_descriptors = vsw_ntxds;
5270 
5271 	if (vsw_setup_ring(ldcp, dp)) {
5272 		DERR(vswp, "%s: setup of ring failed", __func__);
5273 		kmem_free(dp->priv_addr,
5274 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5275 		mutex_destroy(&dp->dlock);
5276 		kmem_free(dp, sizeof (dring_info_t));
5277 		return;
5278 	}
5279 
5280 	/* haven't used any descriptors yet */
5281 	dp->end_idx = 0;
5282 
5283 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5284 	dp->restart_reqd = B_TRUE;
5285 
5286 	/*
5287 	 * Only ever create rings for outgoing lane. Link it onto
5288 	 * end of list.
5289 	 */
5290 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5291 	if (ldcp->lane_out.dringp == NULL) {
5292 		D2(vswp, "%s: adding first outbound privring", __func__);
5293 		ldcp->lane_out.dringp = dp;
5294 	} else {
5295 		tp = ldcp->lane_out.dringp;
5296 		while (tp->next != NULL)
5297 			tp = tp->next;
5298 
5299 		tp->next = dp;
5300 	}
5301 	RW_EXIT(&ldcp->lane_out.dlistrw);
5302 
5303 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5304 }
5305 
5306 /*
5307  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5308  * failure.
5309  */
5310 int
5311 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5312 {
5313 	vnet_public_desc_t	*pub_addr = NULL;
5314 	vsw_private_desc_t	*priv_addr = NULL;
5315 	vsw_t			*vswp = ldcp->ldc_vswp;
5316 	uint64_t		*tmpp;
5317 	uint64_t		offset = 0;
5318 	uint32_t		ncookies = 0;
5319 	static char		*name = "vsw_setup_ring";
5320 	int			i, j, nc, rv;
5321 	size_t			data_sz;
5322 	void			*data_addr;
5323 
5324 	priv_addr = dp->priv_addr;
5325 	pub_addr = dp->pub_addr;
5326 
5327 	/* public section may be null but private should never be */
5328 	ASSERT(priv_addr != NULL);
5329 
5330 	/*
5331 	 * Allocate the region of memory which will be used to hold
5332 	 * the data the descriptors will refer to.
5333 	 */
5334 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5335 
5336 	/*
5337 	 * In order to ensure that the number of ldc cookies per descriptor is
5338 	 * limited to be within the default MAX_COOKIES (2), we take the steps
5339 	 * outlined below:
5340 	 *
5341 	 * Align the entire data buffer area to 8K and carve out per descriptor
5342 	 * data buffers starting from this 8K aligned base address.
5343 	 *
5344 	 * We round up the mtu specified to be a multiple of 2K or 4K.
5345 	 * For sizes up to 12K we round up the size to the next 2K.
5346 	 * For sizes > 12K we round up to the next 4K (otherwise sizes such as
5347 	 * 14K could end up needing 3 cookies, with the buffer spread across
5348 	 * 3 8K pages:  8K+6K, 2K+8K+2K, 6K+8K, ...).
5349 	 */
5350 	if (data_sz <= VNET_12K) {
5351 		data_sz = VNET_ROUNDUP_2K(data_sz);
5352 	} else {
5353 		data_sz = VNET_ROUNDUP_4K(data_sz);
5354 	}
5355 
5356 	dp->desc_data_sz = data_sz;
5357 
5358 	/* allocate extra 8K bytes for alignment */
5359 	dp->data_sz = (vsw_ntxds * data_sz) + VNET_8K;
5360 	data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5361 	dp->data_addr = data_addr;
5362 
5363 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5364 	    dp->data_sz, dp->data_addr);
5365 
5366 	/* align the starting address of the data area to 8K */
5367 	data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)data_addr);
5368 
5369 	tmpp = (uint64_t *)data_addr;
5370 	offset = dp->desc_data_sz/sizeof (tmpp);
5371 
5372 	/*
5373 	 * Initialise some of the private and public (if they exist)
5374 	 * descriptor fields.
5375 	 */
5376 	for (i = 0; i < vsw_ntxds; i++) {
5377 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5378 
5379 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5380 		    &priv_addr->memhandle)) != 0) {
5381 			DERR(vswp, "%s: alloc mem handle failed", name);
5382 			goto setup_ring_cleanup;
5383 		}
5384 
5385 		priv_addr->datap = (void *)tmpp;
5386 
5387 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5388 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5389 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5390 		    &(priv_addr->memcookie[0]), &ncookies);
5391 		if (rv != 0) {
5392 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5393 			    "(rv %d)", name, ldcp->ldc_id, rv);
5394 			goto setup_ring_cleanup;
5395 		}
5396 		priv_addr->bound = 1;
5397 
5398 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5399 		    name, i, priv_addr->memcookie[0].addr,
5400 		    priv_addr->memcookie[0].size);
5401 
5402 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5403 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5404 			    "invalid num of cookies (%d) for size 0x%llx",
5405 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5406 
5407 			goto setup_ring_cleanup;
5408 		} else {
5409 			for (j = 1; j < ncookies; j++) {
5410 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5411 				    &(priv_addr->memcookie[j]));
5412 				if (rv != 0) {
5413 					DERR(vswp, "%s: ldc_mem_nextcookie "
5414 					    "failed rv (%d)", name, rv);
5415 					goto setup_ring_cleanup;
5416 				}
5417 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5418 				    "size 0x%llx", name, j,
5419 				    priv_addr->memcookie[j].addr,
5420 				    priv_addr->memcookie[j].size);
5421 			}
5422 
5423 		}
5424 		priv_addr->ncookies = ncookies;
5425 		priv_addr->dstate = VIO_DESC_FREE;
5426 
5427 		if (pub_addr != NULL) {
5428 
5429 			/* link pub and private sides */
5430 			priv_addr->descp = pub_addr;
5431 
5432 			pub_addr->ncookies = priv_addr->ncookies;
5433 
5434 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5435 				bcopy(&priv_addr->memcookie[nc],
5436 				    &pub_addr->memcookie[nc],
5437 				    sizeof (ldc_mem_cookie_t));
5438 			}
5439 
5440 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5441 			pub_addr++;
5442 		}
5443 
5444 		/*
5445 		 * move to next element in the dring and the next
5446 		 * position in the data buffer.
5447 		 */
5448 		priv_addr++;
5449 		tmpp += offset;
5450 	}
5451 
5452 	return (0);
5453 
5454 setup_ring_cleanup:
5455 	priv_addr = dp->priv_addr;
5456 
5457 	for (j = 0; j < i; j++) {
5458 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5459 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5460 
5461 		mutex_destroy(&priv_addr->dstate_lock);
5462 
5463 		priv_addr++;
5464 	}
5465 	kmem_free(dp->data_addr, dp->data_sz);
5466 
5467 	return (1);
5468 }
5469 
5470 /*
5471  * Searches the private section of a ring for a free descriptor,
5472  * starting at the location of the last free descriptor found
5473  * previously.
5474  *
5475  * Returns 0 if free descriptor is available, and updates state
5476  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5477  *
5478  * FUTURE: might need to return contiguous range of descriptors
5479  * as dring info msg assumes all will be contiguous.
5480  */
5481 static int
5482 vsw_dring_find_free_desc(dring_info_t *dringp,
5483 		vsw_private_desc_t **priv_p, int *idx)
5484 {
5485 	vsw_private_desc_t	*addr = NULL;
5486 	int			num = vsw_ntxds;
5487 	int			ret = 1;
5488 
5489 	D1(NULL, "%s enter\n", __func__);
5490 
5491 	ASSERT(dringp->priv_addr != NULL);
5492 
5493 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5494 	    __func__, dringp, dringp->end_idx);
5495 
5496 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5497 
5498 	mutex_enter(&addr->dstate_lock);
5499 	if (addr->dstate == VIO_DESC_FREE) {
5500 		addr->dstate = VIO_DESC_READY;
5501 		*priv_p = addr;
5502 		*idx = dringp->end_idx;
5503 		dringp->end_idx = (dringp->end_idx + 1) % num;
5504 		ret = 0;
5505 
5506 	}
5507 	mutex_exit(&addr->dstate_lock);
5508 
5509 	/* ring full */
5510 	if (ret == 1) {
5511 		D2(NULL, "%s: no desp free: started at %d", __func__,
5512 		    dringp->end_idx);
5513 	}
5514 
5515 	D1(NULL, "%s: exit\n", __func__);
5516 
5517 	return (ret);
5518 }
5519 
5520 /*
5521  * Map from a dring identifier to the ring itself. Returns
5522  * pointer to ring or NULL if no match found.
5523  *
5524  * Should be called with dlistrw rwlock held as reader.
5525  */
5526 static dring_info_t *
5527 vsw_ident2dring(lane_t *lane, uint64_t ident)
5528 {
5529 	dring_info_t	*dp = NULL;
5530 
5531 	if ((dp = lane->dringp) == NULL) {
5532 		return (NULL);
5533 	} else {
5534 		if (dp->ident == ident)
5535 			return (dp);
5536 
5537 		while (dp != NULL) {
5538 			if (dp->ident == ident)
5539 				break;
5540 			dp = dp->next;
5541 		}
5542 	}
5543 
5544 	return (dp);
5545 }
5546 
5547 /*
5548  * Set the default lane attributes. These are copied into
5549  * the attr msg we send to our peer. If they are not acceptable
5550  * then (currently) the handshake ends.
5551  */
5552 static void
5553 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5554 {
5555 	bzero(lp, sizeof (lane_t));
5556 
5557 	READ_ENTER(&vswp->if_lockrw);
5558 	ether_copy(&(vswp->if_addr), &(lp->addr));
5559 	RW_EXIT(&vswp->if_lockrw);
5560 
5561 	lp->mtu = vswp->max_frame_size;
5562 	lp->addr_type = ADDR_TYPE_MAC;
5563 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5564 	lp->ack_freq = 0;	/* for shared mode */
5565 	lp->seq_num = VNET_ISS;
5566 }
5567 
5568 /*
5569  * Verify that the attributes are acceptable.
5570  *
5571  * FUTURE: If some attributes are not acceptable, change them
5572  * our desired values.
5573  */
5574 static int
5575 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5576 {
5577 	int			ret = 0;
5578 	struct ether_addr	ea;
5579 	vsw_port_t		*port = ldcp->ldc_port;
5580 	lane_t			*lp = &ldcp->lane_out;
5581 
5582 	D1(NULL, "vsw_check_attr enter\n");
5583 
5584 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5585 	    (pkt->xfer_mode != lp->xfer_mode)) {
5586 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5587 		ret = 1;
5588 	}
5589 
5590 	/* Only support MAC addresses at moment. */
5591 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5592 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5593 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5594 		ret = 1;
5595 	}
5596 
5597 	/*
5598 	 * MAC address supplied by device should match that stored
5599 	 * in the vsw-port OBP node. Need to decide what to do if they
5600 	 * don't match, for the moment just warn but don't fail.
5601 	 */
5602 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5603 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5604 		DERR(NULL, "vsw_check_attr: device supplied address "
5605 		    "0x%llx doesn't match node address 0x%llx\n",
5606 		    pkt->addr, port->p_macaddr);
5607 	}
5608 
5609 	/*
5610 	 * Ack freq only makes sense in pkt mode, in shared
5611 	 * mode the ring descriptors say whether or not to
5612 	 * send back an ACK.
5613 	 */
5614 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5615 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5616 	    (VSW_VER_LT(ldcp, 1, 2) &&
5617 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5618 		if (pkt->ack_freq > 0) {
5619 			D2(NULL, "vsw_check_attr: non zero ack freq "
5620 			    " in SHM mode\n");
5621 			ret = 1;
5622 		}
5623 	}
5624 
5625 	if (VSW_VER_LT(ldcp, 1, 4)) {
5626 		/* versions < 1.4, mtu must match */
5627 		if (pkt->mtu != lp->mtu) {
5628 			D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5629 			    pkt->mtu);
5630 			ret = 1;
5631 		}
5632 	} else {
5633 		/* Ver >= 1.4, validate mtu of the peer is at least ETHERMAX */
5634 		if (pkt->mtu < ETHERMAX) {
5635 			ret = 1;
5636 		}
5637 	}
5638 
5639 	D1(NULL, "vsw_check_attr exit\n");
5640 
5641 	return (ret);
5642 }
5643 
5644 /*
5645  * Returns 1 if there is a problem, 0 otherwise.
5646  */
5647 static int
5648 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5649 {
5650 	_NOTE(ARGUNUSED(pkt))
5651 
5652 	int	ret = 0;
5653 
5654 	D1(NULL, "vsw_check_dring_info enter\n");
5655 
5656 	if ((pkt->num_descriptors == 0) ||
5657 	    (pkt->descriptor_size == 0) ||
5658 	    (pkt->ncookies != 1)) {
5659 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5660 		ret = 1;
5661 	}
5662 
5663 	D1(NULL, "vsw_check_dring_info exit\n");
5664 
5665 	return (ret);
5666 }
5667 
5668 /*
5669  * Returns 1 if two memory cookies match. Otherwise returns 0.
5670  */
5671 static int
5672 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5673 {
5674 	if ((m1->addr != m2->addr) ||
5675 	    (m2->size != m2->size)) {
5676 		return (0);
5677 	} else {
5678 		return (1);
5679 	}
5680 }
5681 
5682 /*
5683  * Returns 1 if ring described in reg message matches that
5684  * described by dring_info structure. Otherwise returns 0.
5685  */
5686 static int
5687 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5688 {
5689 	if ((msg->descriptor_size != dp->descriptor_size) ||
5690 	    (msg->num_descriptors != dp->num_descriptors) ||
5691 	    (msg->ncookies != dp->ncookies) ||
5692 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5693 		return (0);
5694 	} else {
5695 		return (1);
5696 	}
5697 
5698 }
5699 
5700 /*
5701  * Reset and free all the resources associated with
5702  * the channel.
5703  */
5704 static void
5705 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5706 {
5707 	dring_info_t		*dp, *dpp;
5708 	lane_t			*lp = NULL;
5709 
5710 	ASSERT(ldcp != NULL);
5711 
5712 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5713 
5714 	if (dir == INBOUND) {
5715 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5716 		    " of channel %lld", __func__, ldcp->ldc_id);
5717 		lp = &ldcp->lane_in;
5718 	} else {
5719 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5720 		    " of channel %lld", __func__, ldcp->ldc_id);
5721 		lp = &ldcp->lane_out;
5722 	}
5723 
5724 	lp->lstate = VSW_LANE_INACTIV;
5725 	lp->seq_num = VNET_ISS;
5726 
5727 	if (lp->dringp) {
5728 		if (dir == INBOUND) {
5729 			WRITE_ENTER(&lp->dlistrw);
5730 			dp = lp->dringp;
5731 			while (dp != NULL) {
5732 				dpp = dp->next;
5733 				if (dp->handle != NULL)
5734 					(void) ldc_mem_dring_unmap(dp->handle);
5735 				kmem_free(dp, sizeof (dring_info_t));
5736 				dp = dpp;
5737 			}
5738 			RW_EXIT(&lp->dlistrw);
5739 		} else {
5740 			/*
5741 			 * unbind, destroy exported dring, free dring struct
5742 			 */
5743 			WRITE_ENTER(&lp->dlistrw);
5744 			dp = lp->dringp;
5745 			vsw_free_ring(dp);
5746 			RW_EXIT(&lp->dlistrw);
5747 		}
5748 		lp->dringp = NULL;
5749 	}
5750 
5751 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5752 }
5753 
5754 /*
5755  * Free ring and all associated resources.
5756  *
5757  * Should be called with dlistrw rwlock held as writer.
5758  */
5759 static void
5760 vsw_free_ring(dring_info_t *dp)
5761 {
5762 	vsw_private_desc_t	*paddr = NULL;
5763 	dring_info_t		*dpp;
5764 	int			i;
5765 
5766 	while (dp != NULL) {
5767 		mutex_enter(&dp->dlock);
5768 		dpp = dp->next;
5769 		if (dp->priv_addr != NULL) {
5770 			/*
5771 			 * First unbind and free the memory handles
5772 			 * stored in each descriptor within the ring.
5773 			 */
5774 			for (i = 0; i < vsw_ntxds; i++) {
5775 				paddr = (vsw_private_desc_t *)
5776 				    dp->priv_addr + i;
5777 				if (paddr->memhandle != NULL) {
5778 					if (paddr->bound == 1) {
5779 						if (ldc_mem_unbind_handle(
5780 						    paddr->memhandle) != 0) {
5781 							DERR(NULL, "error "
5782 							"unbinding handle for "
5783 							"ring 0x%llx at pos %d",
5784 							    dp, i);
5785 							continue;
5786 						}
5787 						paddr->bound = 0;
5788 					}
5789 
5790 					if (ldc_mem_free_handle(
5791 					    paddr->memhandle) != 0) {
5792 						DERR(NULL, "error freeing "
5793 						    "handle for ring 0x%llx "
5794 						    "at pos %d", dp, i);
5795 						continue;
5796 					}
5797 					paddr->memhandle = NULL;
5798 				}
5799 				mutex_destroy(&paddr->dstate_lock);
5800 			}
5801 			kmem_free(dp->priv_addr,
5802 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5803 		}
5804 
5805 		/*
5806 		 * Now unbind and destroy the ring itself.
5807 		 */
5808 		if (dp->handle != NULL) {
5809 			(void) ldc_mem_dring_unbind(dp->handle);
5810 			(void) ldc_mem_dring_destroy(dp->handle);
5811 		}
5812 
5813 		if (dp->data_addr != NULL) {
5814 			kmem_free(dp->data_addr, dp->data_sz);
5815 		}
5816 
5817 		mutex_exit(&dp->dlock);
5818 		mutex_destroy(&dp->dlock);
5819 		mutex_destroy(&dp->restart_lock);
5820 		kmem_free(dp, sizeof (dring_info_t));
5821 
5822 		dp = dpp;
5823 	}
5824 }
5825 
5826 /*
5827  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5828  * This thread is woken up by the LDC interrupt handler to process
5829  * LDC packets and receive data.
5830  */
5831 static void
5832 vsw_ldc_rx_worker(void *arg)
5833 {
5834 	callb_cpr_t	cprinfo;
5835 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5836 	vsw_t *vswp = ldcp->ldc_vswp;
5837 
5838 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5839 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5840 	    "vsw_rx_thread");
5841 	mutex_enter(&ldcp->rx_thr_lock);
5842 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5843 
5844 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5845 		/*
5846 		 * Wait until the data is received or a stop
5847 		 * request is received.
5848 		 */
5849 		while (!(ldcp->rx_thr_flags &
5850 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5851 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5852 		}
5853 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5854 
5855 		/*
5856 		 * First process the stop request.
5857 		 */
5858 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5859 			D2(vswp, "%s(%lld):Rx thread stopped\n",
5860 			    __func__, ldcp->ldc_id);
5861 			break;
5862 		}
5863 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5864 		mutex_exit(&ldcp->rx_thr_lock);
5865 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5866 		    __func__, ldcp->ldc_id);
5867 		mutex_enter(&ldcp->ldc_cblock);
5868 		vsw_process_pkt(ldcp);
5869 		mutex_exit(&ldcp->ldc_cblock);
5870 		mutex_enter(&ldcp->rx_thr_lock);
5871 	}
5872 
5873 	/*
5874 	 * Update the run status and wakeup the thread that
5875 	 * has sent the stop request.
5876 	 */
5877 	ldcp->rx_thr_flags &= ~VSW_WTHR_STOP;
5878 	ldcp->rx_thread = NULL;
5879 	CALLB_CPR_EXIT(&cprinfo);
5880 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5881 	thread_exit();
5882 }
5883 
5884 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5885 static void
5886 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5887 {
5888 	kt_did_t	tid = 0;
5889 	vsw_t		*vswp = ldcp->ldc_vswp;
5890 
5891 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5892 	/*
5893 	 * Send a stop request by setting the stop flag and
5894 	 * wait until the receive thread stops.
5895 	 */
5896 	mutex_enter(&ldcp->rx_thr_lock);
5897 	if (ldcp->rx_thread != NULL) {
5898 		tid = ldcp->rx_thread->t_did;
5899 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5900 		cv_signal(&ldcp->rx_thr_cv);
5901 	}
5902 	mutex_exit(&ldcp->rx_thr_lock);
5903 
5904 	if (tid != 0) {
5905 		thread_join(tid);
5906 	}
5907 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5908 }
5909 
5910 /*
5911  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5912  * This thread is woken up by the vsw_portsend to transmit
5913  * packets.
5914  */
5915 static void
5916 vsw_ldc_tx_worker(void *arg)
5917 {
5918 	callb_cpr_t	cprinfo;
5919 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5920 	vsw_t *vswp = ldcp->ldc_vswp;
5921 	mblk_t *mp;
5922 	mblk_t *tmp;
5923 
5924 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5925 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5926 	    "vnet_tx_thread");
5927 	mutex_enter(&ldcp->tx_thr_lock);
5928 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5929 
5930 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5931 		/*
5932 		 * Wait until the data is received or a stop
5933 		 * request is received.
5934 		 */
5935 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5936 		    (ldcp->tx_mhead == NULL)) {
5937 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5938 		}
5939 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5940 
5941 		/*
5942 		 * First process the stop request.
5943 		 */
5944 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5945 			D2(vswp, "%s(%lld):tx thread stopped\n",
5946 			    __func__, ldcp->ldc_id);
5947 			break;
5948 		}
5949 		mp = ldcp->tx_mhead;
5950 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5951 		ldcp->tx_cnt = 0;
5952 		mutex_exit(&ldcp->tx_thr_lock);
5953 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5954 		    __func__, ldcp->ldc_id);
5955 		while (mp != NULL) {
5956 			tmp = mp->b_next;
5957 			mp->b_next = mp->b_prev = NULL;
5958 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5959 			mp = tmp;
5960 		}
5961 		mutex_enter(&ldcp->tx_thr_lock);
5962 	}
5963 
5964 	/*
5965 	 * Update the run status and wakeup the thread that
5966 	 * has sent the stop request.
5967 	 */
5968 	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
5969 	ldcp->tx_thread = NULL;
5970 	CALLB_CPR_EXIT(&cprinfo);
5971 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5972 	thread_exit();
5973 }
5974 
/* vsw_stop_tx_thread -- Co-ordinate with transmit thread to stop it */
static void
vsw_stop_tx_thread(vsw_ldc_t *ldcp)
{
	kt_did_t	tid = 0;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/*
	 * Send a stop request by setting the stop flag and
	 * wait until the transmit thread stops.
	 */
	mutex_enter(&ldcp->tx_thr_lock);
	if (ldcp->tx_thread != NULL) {
		/* capture the thread id under the lock for thread_join() */
		tid = ldcp->tx_thread->t_did;
		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
		cv_signal(&ldcp->tx_thr_cv);
	}
	mutex_exit(&ldcp->tx_thr_lock);

	if (tid != 0) {
		thread_join(tid);
	}

	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}
6001 
6002 /* vsw_reclaim_dring -- reclaim descriptors */
6003 static int
6004 vsw_reclaim_dring(dring_info_t *dp, int start)
6005 {
6006 	int i, j, len;
6007 	vsw_private_desc_t *priv_addr;
6008 	vnet_public_desc_t *pub_addr;
6009 
6010 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
6011 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6012 	len = dp->num_descriptors;
6013 
6014 	D2(NULL, "%s: start index %ld\n", __func__, start);
6015 
6016 	j = 0;
6017 	for (i = start; j < len; i = (i + 1) % len, j++) {
6018 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6019 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6020 
6021 		mutex_enter(&priv_addr->dstate_lock);
6022 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
6023 			mutex_exit(&priv_addr->dstate_lock);
6024 			break;
6025 		}
6026 		pub_addr->hdr.dstate = VIO_DESC_FREE;
6027 		priv_addr->dstate = VIO_DESC_FREE;
6028 		/* clear all the fields */
6029 		priv_addr->datalen = 0;
6030 		pub_addr->hdr.ack = 0;
6031 		mutex_exit(&priv_addr->dstate_lock);
6032 
6033 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
6034 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
6035 	}
6036 	return (j);
6037 }
6038 
6039 /*
6040  * Debugging routines
6041  */
6042 static void
6043 display_state(void)
6044 {
6045 	vsw_t		*vswp;
6046 	vsw_port_list_t	*plist;
6047 	vsw_port_t 	*port;
6048 	vsw_ldc_list_t	*ldcl;
6049 	vsw_ldc_t 	*ldcp;
6050 	extern vsw_t 	*vsw_head;
6051 
6052 	cmn_err(CE_NOTE, "***** system state *****");
6053 
6054 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
6055 		plist = &vswp->plist;
6056 		READ_ENTER(&plist->lockrw);
6057 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
6058 		    vswp->instance, plist->num_ports);
6059 
6060 		for (port = plist->head; port != NULL; port = port->p_next) {
6061 			ldcl = &port->p_ldclist;
6062 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
6063 			    port->p_instance, port->num_ldcs);
6064 			READ_ENTER(&ldcl->lockrw);
6065 			ldcp = ldcl->head;
6066 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
6067 				cmn_err(CE_CONT, "chan %lu : dev %d : "
6068 				    "status %d : phase %u\n",
6069 				    ldcp->ldc_id, ldcp->dev_class,
6070 				    ldcp->ldc_status, ldcp->hphase);
6071 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
6072 				    "psession %lu\n", ldcp->ldc_id,
6073 				    ldcp->local_session, ldcp->peer_session);
6074 
6075 				cmn_err(CE_CONT, "Inbound lane:\n");
6076 				display_lane(&ldcp->lane_in);
6077 				cmn_err(CE_CONT, "Outbound lane:\n");
6078 				display_lane(&ldcp->lane_out);
6079 			}
6080 			RW_EXIT(&ldcl->lockrw);
6081 		}
6082 		RW_EXIT(&plist->lockrw);
6083 	}
6084 	cmn_err(CE_NOTE, "***** system state *****");
6085 }
6086 
6087 static void
6088 display_lane(lane_t *lp)
6089 {
6090 	dring_info_t	*drp;
6091 
6092 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
6093 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
6094 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
6095 	    lp->addr_type, lp->addr, lp->xfer_mode);
6096 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
6097 
6098 	cmn_err(CE_CONT, "Dring info:\n");
6099 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
6100 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
6101 		    drp->num_descriptors, drp->descriptor_size);
6102 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
6103 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
6104 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
6105 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
6106 		    drp->ident, drp->end_idx);
6107 		display_ring(drp);
6108 	}
6109 }
6110 
6111 static void
6112 display_ring(dring_info_t *dringp)
6113 {
6114 	uint64_t		i;
6115 	uint64_t		priv_count = 0;
6116 	uint64_t		pub_count = 0;
6117 	vnet_public_desc_t	*pub_addr = NULL;
6118 	vsw_private_desc_t	*priv_addr = NULL;
6119 
6120 	for (i = 0; i < vsw_ntxds; i++) {
6121 		if (dringp->pub_addr != NULL) {
6122 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
6123 
6124 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
6125 				pub_count++;
6126 		}
6127 
6128 		if (dringp->priv_addr != NULL) {
6129 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
6130 
6131 			if (priv_addr->dstate == VIO_DESC_FREE)
6132 				priv_count++;
6133 		}
6134 	}
6135 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
6136 	    i, priv_count, pub_count);
6137 }
6138 
6139 static void
6140 dump_flags(uint64_t state)
6141 {
6142 	int	i;
6143 
6144 	typedef struct flag_name {
6145 		int	flag_val;
6146 		char	*flag_name;
6147 	} flag_name_t;
6148 
6149 	flag_name_t	flags[] = {
6150 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
6151 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
6152 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
6153 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
6154 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
6155 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
6156 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
6157 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
6158 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
6159 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
6160 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
6161 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
6162 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
6163 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
6164 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
6165 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
6166 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
6167 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
6168 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
6169 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
6170 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
6171 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
6172 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
6173 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
6174 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
6175 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
6176 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
6177 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
6178 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
6179 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
6180 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
6181 
6182 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
6183 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
6184 		if (state & flags[i].flag_val)
6185 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
6186 	}
6187 }
6188