xref: /illumos-gate/usr/src/uts/common/os/sunmdi.c (revision 179c3dac)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
28  * detailed discussion of the overall mpxio architecture.
29  *
30  * Default locking order:
31  *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
36  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
37  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
38  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
39  */
40 
41 #include <sys/note.h>
42 #include <sys/types.h>
43 #include <sys/varargs.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/poll.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/cmn_err.h>
55 #include <sys/stat.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 #include <sys/ddipropdefs.h>
59 #include <sys/sunndi.h>
60 #include <sys/ndi_impldefs.h>
61 #include <sys/promif.h>
62 #include <sys/sunmdi.h>
63 #include <sys/mdi_impldefs.h>
64 #include <sys/taskq.h>
65 #include <sys/epm.h>
66 #include <sys/sunpm.h>
67 #include <sys/modhash.h>
68 #include <sys/disp.h>
69 #include <sys/autoconf.h>
70 #include <sys/sysmacros.h>
71 
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(level, (i_mdi_log argument list)):
 *	Call i_mdi_log() with the parenthesized argument list when mdi_debug
 *	is at least 'level'.  The expansion is wrapped in do { } while (0) so
 *	that it is a single statement: a bare 'if' here would capture the
 *	'else' of an enclosing unbraced if/else.  Compiles to nothing in
 *	non-DEBUG kernels.
 */
#define	MDI_DEBUG(level, stmnt) \
	do { \
		if (mdi_debug >= (level)) \
			i_mdi_log stmnt; \
	} while (0)
static void i_mdi_log(int, dev_info_t *, const char *fmt, ...);
#else	/* !DEBUG */
#define	MDI_DEBUG(level, stmnt)
#endif	/* DEBUG */
82 
83 extern pri_t	minclsyspri;
84 extern int	modrootloaded;
85 
86 /*
87  * Global mutex:
88  * Protects vHCI list and structure members.
89  */
90 kmutex_t	mdi_mutex;
91 
92 /*
93  * Registered vHCI class driver lists
94  */
95 int		mdi_vhci_count;
96 mdi_vhci_t	*mdi_vhci_head;
97 mdi_vhci_t	*mdi_vhci_tail;
98 
99 /*
100  * Client Hash Table size
101  */
102 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
103 
104 /*
105  * taskq interface definitions
106  */
107 #define	MDI_TASKQ_N_THREADS	8
108 #define	MDI_TASKQ_PRI		minclsyspri
109 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
110 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
111 
112 taskq_t				*mdi_taskq;
113 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
114 
115 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
116 
117 /*
118  * The data should be "quiet" for this interval (in seconds) before the
119  * vhci cached data is flushed to the disk.
120  */
121 static int mdi_vhcache_flush_delay = 10;
122 
123 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
124 static int mdi_vhcache_flush_daemon_idle_time = 60;
125 
126 /*
127  * MDI falls back to discovery of all paths when a bus_config_one fails.
128  * The following parameters can be used to tune this operation.
129  *
130  * mdi_path_discovery_boot
131  *	Number of times path discovery will be attempted during early boot.
132  *	Probably there is no reason to ever set this value to greater than one.
133  *
134  * mdi_path_discovery_postboot
135  *	Number of times path discovery will be attempted after early boot.
136  *	Set it to a minimum of two to allow for discovery of iscsi paths which
137  *	may happen very late during booting.
138  *
139  * mdi_path_discovery_interval
140  *	Minimum number of seconds MDI will wait between successive discovery
141  *	of all paths. Set it to -1 to disable discovery of all paths.
142  */
143 static int mdi_path_discovery_boot = 1;
144 static int mdi_path_discovery_postboot = 2;
145 static int mdi_path_discovery_interval = 10;
146 
147 /*
148  * number of seconds the asynchronous configuration thread will sleep idle
149  * before exiting.
150  */
151 static int mdi_async_config_idle_time = 600;
152 
153 static int mdi_bus_config_cache_hash_size = 256;
154 
155 /* turns off multithreaded configuration for certain operations */
156 static int mdi_mtc_off = 0;
157 
158 /*
159  * The "path" to a pathinfo node is identical to the /devices path to a
160  * devinfo node had the device been enumerated under a pHCI instead of
161  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
162  * This association persists across create/delete of the pathinfo nodes,
163  * but not across reboot.
164  */
165 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
166 static int		mdi_pathmap_hash_size = 256;
167 static kmutex_t		mdi_pathmap_mutex;
168 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
169 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
170 
171 /*
172  * MDI component property name/value string definitions
173  */
174 const char 		*mdi_component_prop = "mpxio-component";
175 const char		*mdi_component_prop_vhci = "vhci";
176 const char		*mdi_component_prop_phci = "phci";
177 const char		*mdi_component_prop_client = "client";
178 
179 /*
180  * MDI client global unique identifier property name
181  */
182 const char		*mdi_client_guid_prop = "client-guid";
183 
184 /*
185  * MDI client load balancing property name/value string definitions
186  */
187 const char		*mdi_load_balance = "load-balance";
188 const char		*mdi_load_balance_none = "none";
189 const char		*mdi_load_balance_rr = "round-robin";
190 const char		*mdi_load_balance_lba = "logical-block";
191 
192 /*
193  * Obsolete vHCI class definition; to be removed after Leadville update
194  */
195 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
196 
197 static char vhci_greeting[] =
198 	"\tThere already exists one vHCI driver for class %s\n"
199 	"\tOnly one vHCI driver for each class is allowed\n";
200 
201 /*
202  * Static function prototypes
203  */
204 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
205 static int		i_mdi_client_offline(dev_info_t *, uint_t);
206 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
207 static void		i_mdi_phci_post_detach(dev_info_t *,
208 			    ddi_detach_cmd_t, int);
209 static int		i_mdi_client_pre_detach(dev_info_t *,
210 			    ddi_detach_cmd_t);
211 static void		i_mdi_client_post_detach(dev_info_t *,
212 			    ddi_detach_cmd_t, int);
213 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
214 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
215 static int 		i_mdi_lba_lb(mdi_client_t *ct,
216 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
217 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
218 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
219 static void		i_mdi_pm_reset_client(mdi_client_t *);
220 static int		i_mdi_power_all_phci(mdi_client_t *);
221 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
222 
223 
224 /*
225  * Internal mdi_pathinfo node functions
226  */
227 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
228 
229 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
230 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
231 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
232 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
233 static void		i_mdi_phci_unlock(mdi_phci_t *);
234 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
235 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
236 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
237 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
238 			    mdi_client_t *);
239 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
240 static void		i_mdi_client_remove_path(mdi_client_t *,
241 			    mdi_pathinfo_t *);
242 
243 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
244 			    mdi_pathinfo_state_t, int);
245 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
246 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
247 			    char **, int);
248 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
249 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
250 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
251 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
252 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
253 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
254 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
255 static void		i_mdi_client_update_state(mdi_client_t *);
256 static int		i_mdi_client_compute_state(mdi_client_t *,
257 			    mdi_phci_t *);
258 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
259 static void		i_mdi_client_unlock(mdi_client_t *);
260 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
261 static mdi_client_t	*i_devi_get_client(dev_info_t *);
262 /*
263  * NOTE: this will be removed once the NWS files are changed to use the new
264  * mdi_{enable,disable}_path interfaces
265  */
266 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
267 				int, int);
268 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
269 				mdi_vhci_t *vh, int flags, int op);
270 /*
271  * Failover related function prototypes
272  */
273 static int		i_mdi_failover(void *);
274 
275 /*
276  * misc internal functions
277  */
278 static int		i_mdi_get_hash_key(char *);
279 static int		i_map_nvlist_error_to_mdi(int);
280 static void		i_mdi_report_path_state(mdi_client_t *,
281 			    mdi_pathinfo_t *);
282 
283 static void		setup_vhci_cache(mdi_vhci_t *);
284 static int		destroy_vhci_cache(mdi_vhci_t *);
285 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
286 static boolean_t	stop_vhcache_flush_thread(void *, int);
287 static void		free_string_array(char **, int);
288 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
289 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
290 static void		free_vhcache_client(mdi_vhcache_client_t *);
291 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
292 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
293 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
294 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
295 static void		vhcache_pi_add(mdi_vhci_config_t *,
296 			    struct mdi_pathinfo *);
297 static void		vhcache_pi_remove(mdi_vhci_config_t *,
298 			    struct mdi_pathinfo *);
299 static void		free_phclient_path_list(mdi_phys_path_t *);
300 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
301 static int		flush_vhcache(mdi_vhci_config_t *, int);
302 static void		vhcache_dirty(mdi_vhci_config_t *);
303 static void		free_async_client_config(mdi_async_client_config_t *);
304 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
305 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
306 static nvlist_t		*read_on_disk_vhci_cache(char *);
307 extern int		fread_nvlist(char *, nvlist_t **);
308 extern int		fwrite_nvlist(char *, nvlist_t *);
309 
310 /* called once when first vhci registers with mdi */
311 static void
312 i_mdi_init()
313 {
314 	static int initialized = 0;
315 
316 	if (initialized)
317 		return;
318 	initialized = 1;
319 
320 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
321 
322 	/* Create our taskq resources */
323 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
324 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
325 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
326 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
327 
328 	/* Allocate ['path_instance' <-> "path"] maps */
329 	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
330 	mdi_pathmap_bypath = mod_hash_create_strhash(
331 	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
332 	    mod_hash_null_valdtor);
333 	mdi_pathmap_byinstance = mod_hash_create_idhash(
334 	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
335 	    mod_hash_null_valdtor);
336 }
337 
338 /*
339  * mdi_get_component_type():
340  *		Return mpxio component type
341  * Return Values:
342  *		MDI_COMPONENT_NONE
343  *		MDI_COMPONENT_VHCI
344  *		MDI_COMPONENT_PHCI
345  *		MDI_COMPONENT_CLIENT
346  * XXX This doesn't work under multi-level MPxIO and should be
347  *	removed when clients migrate mdi_component_is_*() interfaces.
348  */
349 int
350 mdi_get_component_type(dev_info_t *dip)
351 {
352 	return (DEVI(dip)->devi_mdi_component);
353 }
354 
355 /*
356  * mdi_vhci_register():
357  *		Register a vHCI module with the mpxio framework
358  *		mdi_vhci_register() is called by vHCI drivers to register the
359  *		'class_driver' vHCI driver and its MDI entrypoints with the
360  *		mpxio framework.  The vHCI driver must call this interface as
361  *		part of its attach(9e) handler.
362  *		Competing threads may try to attach mdi_vhci_register() as
363  *		the vHCI drivers are loaded and attached as a result of pHCI
364  *		driver instance registration (mdi_phci_register()) with the
365  *		framework.
366  * Return Values:
367  *		MDI_SUCCESS
368  *		MDI_FAILURE
369  */
370 /*ARGSUSED*/
371 int
372 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
373     int flags)
374 {
375 	mdi_vhci_t		*vh = NULL;
376 
377 	/* Registrant can't be older */
378 	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);
379 
380 #ifdef DEBUG
381 	/*
382 	 * IB nexus driver is loaded only when IB hardware is present.
383 	 * In order to be able to do this there is a need to drive the loading
384 	 * and attaching of the IB nexus driver (especially when an IB hardware
385 	 * is dynamically plugged in) when an IB HCA driver (PHCI)
386 	 * is being attached. Unfortunately this gets into the limitations
387 	 * of devfs as there seems to be no clean way to drive configuration
388 	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
389 	 * for IB.
390 	 */
391 	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
392 		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
393 #endif
394 
395 	i_mdi_init();
396 
397 	mutex_enter(&mdi_mutex);
398 	/*
399 	 * Scan for already registered vhci
400 	 */
401 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
402 		if (strcmp(vh->vh_class, class) == 0) {
403 			/*
404 			 * vHCI has already been created.  Check for valid
405 			 * vHCI ops registration.  We only support one vHCI
406 			 * module per class
407 			 */
408 			if (vh->vh_ops != NULL) {
409 				mutex_exit(&mdi_mutex);
410 				cmn_err(CE_NOTE, vhci_greeting, class);
411 				return (MDI_FAILURE);
412 			}
413 			break;
414 		}
415 	}
416 
417 	/*
418 	 * if not yet created, create the vHCI component
419 	 */
420 	if (vh == NULL) {
421 		struct client_hash	*hash = NULL;
422 		char			*load_balance;
423 
424 		/*
425 		 * Allocate and initialize the mdi extensions
426 		 */
427 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
428 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
429 		    KM_SLEEP);
430 		vh->vh_client_table = hash;
431 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
432 		(void) strcpy(vh->vh_class, class);
433 		vh->vh_lb = LOAD_BALANCE_RR;
434 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
435 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
436 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
437 				vh->vh_lb = LOAD_BALANCE_NONE;
438 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
439 				    == 0) {
440 				vh->vh_lb = LOAD_BALANCE_LBA;
441 			}
442 			ddi_prop_free(load_balance);
443 		}
444 
445 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
446 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
447 
448 		/*
449 		 * Store the vHCI ops vectors
450 		 */
451 		vh->vh_dip = vdip;
452 		vh->vh_ops = vops;
453 
454 		setup_vhci_cache(vh);
455 
456 		if (mdi_vhci_head == NULL) {
457 			mdi_vhci_head = vh;
458 		}
459 		if (mdi_vhci_tail) {
460 			mdi_vhci_tail->vh_next = vh;
461 		}
462 		mdi_vhci_tail = vh;
463 		mdi_vhci_count++;
464 	}
465 
466 	/*
467 	 * Claim the devfs node as a vhci component
468 	 */
469 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
470 
471 	/*
472 	 * Initialize our back reference from dev_info node
473 	 */
474 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
475 	mutex_exit(&mdi_mutex);
476 	return (MDI_SUCCESS);
477 }
478 
479 /*
480  * mdi_vhci_unregister():
481  *		Unregister a vHCI module from mpxio framework
482  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
483  * 		of a vhci to unregister it from the framework.
484  * Return Values:
485  *		MDI_SUCCESS
486  *		MDI_FAILURE
487  */
488 /*ARGSUSED*/
489 int
490 mdi_vhci_unregister(dev_info_t *vdip, int flags)
491 {
492 	mdi_vhci_t	*found, *vh, *prev = NULL;
493 
494 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
495 
496 	/*
497 	 * Check for invalid VHCI
498 	 */
499 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
500 		return (MDI_FAILURE);
501 
502 	/*
503 	 * Scan the list of registered vHCIs for a match
504 	 */
505 	mutex_enter(&mdi_mutex);
506 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
507 		if (found == vh)
508 			break;
509 		prev = found;
510 	}
511 
512 	if (found == NULL) {
513 		mutex_exit(&mdi_mutex);
514 		return (MDI_FAILURE);
515 	}
516 
517 	/*
518 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
519 	 * should have been unregistered, before a vHCI can be
520 	 * unregistered.
521 	 */
522 	MDI_VHCI_PHCI_LOCK(vh);
523 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
524 		MDI_VHCI_PHCI_UNLOCK(vh);
525 		mutex_exit(&mdi_mutex);
526 		return (MDI_FAILURE);
527 	}
528 	MDI_VHCI_PHCI_UNLOCK(vh);
529 
530 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
531 		mutex_exit(&mdi_mutex);
532 		return (MDI_FAILURE);
533 	}
534 
535 	/*
536 	 * Remove the vHCI from the global list
537 	 */
538 	if (vh == mdi_vhci_head) {
539 		mdi_vhci_head = vh->vh_next;
540 	} else {
541 		prev->vh_next = vh->vh_next;
542 	}
543 	if (vh == mdi_vhci_tail) {
544 		mdi_vhci_tail = prev;
545 	}
546 	mdi_vhci_count--;
547 	mutex_exit(&mdi_mutex);
548 
549 	vh->vh_ops = NULL;
550 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
551 	DEVI(vdip)->devi_mdi_xhci = NULL;
552 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
553 	kmem_free(vh->vh_client_table,
554 	    mdi_client_table_size * sizeof (struct client_hash));
555 	mutex_destroy(&vh->vh_phci_mutex);
556 	mutex_destroy(&vh->vh_client_mutex);
557 
558 	kmem_free(vh, sizeof (mdi_vhci_t));
559 	return (MDI_SUCCESS);
560 }
561 
562 /*
563  * i_mdi_vhci_class2vhci():
564  *		Look for a matching vHCI module given a vHCI class name
565  * Return Values:
566  *		Handle to a vHCI component
567  *		NULL
568  */
569 static mdi_vhci_t *
570 i_mdi_vhci_class2vhci(char *class)
571 {
572 	mdi_vhci_t	*vh = NULL;
573 
574 	ASSERT(!MUTEX_HELD(&mdi_mutex));
575 
576 	mutex_enter(&mdi_mutex);
577 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
578 		if (strcmp(vh->vh_class, class) == 0) {
579 			break;
580 		}
581 	}
582 	mutex_exit(&mdi_mutex);
583 	return (vh);
584 }
585 
586 /*
587  * i_devi_get_vhci():
588  *		Utility function to get the handle to a vHCI component
589  * Return Values:
590  *		Handle to a vHCI component
591  *		NULL
592  */
593 mdi_vhci_t *
594 i_devi_get_vhci(dev_info_t *vdip)
595 {
596 	mdi_vhci_t	*vh = NULL;
597 	if (MDI_VHCI(vdip)) {
598 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
599 	}
600 	return (vh);
601 }
602 
603 /*
604  * mdi_phci_register():
605  *		Register a pHCI module with mpxio framework
606  *		mdi_phci_register() is called by pHCI drivers to register with
607  *		the mpxio framework and a specific 'class_driver' vHCI.  The
608  *		pHCI driver must call this interface as part of its attach(9e)
609  *		handler.
610  * Return Values:
611  *		MDI_SUCCESS
612  *		MDI_FAILURE
613  */
614 /*ARGSUSED*/
615 int
616 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
617 {
618 	mdi_phci_t		*ph;
619 	mdi_vhci_t		*vh;
620 	char			*data;
621 	char			*pathname;
622 
623 	/*
624 	 * Some subsystems, like fcp, perform pHCI registration from a
625 	 * different thread than the one doing the pHCI attach(9E) - the
626 	 * driver attach code is waiting for this other thread to complete.
627 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
628 	 * (indicating that some thread has done an ndi_devi_enter of parent)
629 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
630 	 */
631 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
632 
633 	pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
634 	(void) ddi_pathname(pdip, pathname);
635 
636 	/*
637 	 * Check for mpxio-disable property. Enable mpxio if the property is
638 	 * missing or not set to "yes".
639 	 * If the property is set to "yes" then emit a brief message.
640 	 */
641 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
642 	    &data) == DDI_SUCCESS)) {
643 		if (strcmp(data, "yes") == 0) {
644 			MDI_DEBUG(1, (CE_CONT, pdip,
645 			    "?%s (%s%d) multipath capabilities "
646 			    "disabled via %s.conf.\n", pathname,
647 			    ddi_driver_name(pdip), ddi_get_instance(pdip),
648 			    ddi_driver_name(pdip)));
649 			ddi_prop_free(data);
650 			kmem_free(pathname, MAXPATHLEN);
651 			return (MDI_FAILURE);
652 		}
653 		ddi_prop_free(data);
654 	}
655 
656 	kmem_free(pathname, MAXPATHLEN);
657 
658 	/*
659 	 * Search for a matching vHCI
660 	 */
661 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
662 	if (vh == NULL) {
663 		return (MDI_FAILURE);
664 	}
665 
666 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
667 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
668 	ph->ph_dip = pdip;
669 	ph->ph_vhci = vh;
670 	ph->ph_next = NULL;
671 	ph->ph_unstable = 0;
672 	ph->ph_vprivate = 0;
673 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
674 
675 	MDI_PHCI_LOCK(ph);
676 	MDI_PHCI_SET_POWER_UP(ph);
677 	MDI_PHCI_UNLOCK(ph);
678 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
679 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
680 
681 	vhcache_phci_add(vh->vh_config, ph);
682 
683 	MDI_VHCI_PHCI_LOCK(vh);
684 	if (vh->vh_phci_head == NULL) {
685 		vh->vh_phci_head = ph;
686 	}
687 	if (vh->vh_phci_tail) {
688 		vh->vh_phci_tail->ph_next = ph;
689 	}
690 	vh->vh_phci_tail = ph;
691 	vh->vh_phci_count++;
692 	MDI_VHCI_PHCI_UNLOCK(vh);
693 
694 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
695 	return (MDI_SUCCESS);
696 }
697 
698 /*
699  * mdi_phci_unregister():
700  *		Unregister a pHCI module from mpxio framework
701  *		mdi_phci_unregister() is called by the pHCI drivers from their
702  *		detach(9E) handler to unregister their instances from the
703  *		framework.
704  * Return Values:
705  *		MDI_SUCCESS
706  *		MDI_FAILURE
707  */
708 /*ARGSUSED*/
709 int
710 mdi_phci_unregister(dev_info_t *pdip, int flags)
711 {
712 	mdi_vhci_t		*vh;
713 	mdi_phci_t		*ph;
714 	mdi_phci_t		*tmp;
715 	mdi_phci_t		*prev = NULL;
716 
717 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
718 
719 	ph = i_devi_get_phci(pdip);
720 	if (ph == NULL) {
721 		MDI_DEBUG(1, (CE_WARN, pdip,
722 		    "!pHCI unregister: Not a valid pHCI"));
723 		return (MDI_FAILURE);
724 	}
725 
726 	vh = ph->ph_vhci;
727 	ASSERT(vh != NULL);
728 	if (vh == NULL) {
729 		MDI_DEBUG(1, (CE_WARN, pdip,
730 		    "!pHCI unregister: Not a valid vHCI"));
731 		return (MDI_FAILURE);
732 	}
733 
734 	MDI_VHCI_PHCI_LOCK(vh);
735 	tmp = vh->vh_phci_head;
736 	while (tmp) {
737 		if (tmp == ph) {
738 			break;
739 		}
740 		prev = tmp;
741 		tmp = tmp->ph_next;
742 	}
743 
744 	if (ph == vh->vh_phci_head) {
745 		vh->vh_phci_head = ph->ph_next;
746 	} else {
747 		prev->ph_next = ph->ph_next;
748 	}
749 
750 	if (ph == vh->vh_phci_tail) {
751 		vh->vh_phci_tail = prev;
752 	}
753 
754 	vh->vh_phci_count--;
755 	MDI_VHCI_PHCI_UNLOCK(vh);
756 
757 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
758 	    ESC_DDI_INITIATOR_UNREGISTER);
759 	vhcache_phci_remove(vh->vh_config, ph);
760 	cv_destroy(&ph->ph_unstable_cv);
761 	mutex_destroy(&ph->ph_mutex);
762 	kmem_free(ph, sizeof (mdi_phci_t));
763 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
764 	DEVI(pdip)->devi_mdi_xhci = NULL;
765 	return (MDI_SUCCESS);
766 }
767 
768 /*
769  * i_devi_get_phci():
770  * 		Utility function to return the phci extensions.
771  */
772 static mdi_phci_t *
773 i_devi_get_phci(dev_info_t *pdip)
774 {
775 	mdi_phci_t	*ph = NULL;
776 
777 	if (MDI_PHCI(pdip)) {
778 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
779 	}
780 	return (ph);
781 }
782 
783 /*
784  * Single thread mdi entry into devinfo node for modifying its children.
785  * If necessary we perform an ndi_devi_enter of the vHCI before doing
786  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
787  * for the vHCI and one for the pHCI.
788  */
789 void
790 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
791 {
792 	dev_info_t	*vdip;
793 	int		vcircular, pcircular;
794 
795 	/* Verify calling context */
796 	ASSERT(MDI_PHCI(phci_dip));
797 	vdip = mdi_devi_get_vdip(phci_dip);
798 	ASSERT(vdip);			/* A pHCI always has a vHCI */
799 
800 	/*
801 	 * If pHCI is detaching then the framework has already entered the
802 	 * vHCI on a threads that went down the code path leading to
803 	 * detach_node().  This framework enter of the vHCI during pHCI
804 	 * detach is done to avoid deadlock with vHCI power management
805 	 * operations which enter the vHCI and the enter down the path
806 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
807 	 * enter of the vHCI on frameworks vHCI enter that has already
808 	 * occurred - this is OK because we know that the framework thread
809 	 * doing detach is waiting for our completion.
810 	 *
811 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
812 	 * race with detach - but we can't do that because the framework has
813 	 * already entered the parent, so we have some complexity instead.
814 	 */
815 	for (;;) {
816 		if (ndi_devi_tryenter(vdip, &vcircular)) {
817 			ASSERT(vcircular != -1);
818 			if (DEVI_IS_DETACHING(phci_dip)) {
819 				ndi_devi_exit(vdip, vcircular);
820 				vcircular = -1;
821 			}
822 			break;
823 		} else if (DEVI_IS_DETACHING(phci_dip)) {
824 			vcircular = -1;
825 			break;
826 		} else {
827 			delay(1);
828 		}
829 	}
830 
831 	ndi_devi_enter(phci_dip, &pcircular);
832 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
833 }
834 
835 /*
836  * Attempt to mdi_devi_enter.
837  */
838 int
839 mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
840 {
841 	dev_info_t	*vdip;
842 	int		vcircular, pcircular;
843 
844 	/* Verify calling context */
845 	ASSERT(MDI_PHCI(phci_dip));
846 	vdip = mdi_devi_get_vdip(phci_dip);
847 	ASSERT(vdip);			/* A pHCI always has a vHCI */
848 
849 	if (ndi_devi_tryenter(vdip, &vcircular)) {
850 		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
851 			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
852 			return (1);	/* locked */
853 		}
854 		ndi_devi_exit(vdip, vcircular);
855 	}
856 	return (0);			/* busy */
857 }
858 
859 /*
860  * Release mdi_devi_enter or successful mdi_devi_tryenter.
861  */
862 void
863 mdi_devi_exit(dev_info_t *phci_dip, int circular)
864 {
865 	dev_info_t	*vdip;
866 	int		vcircular, pcircular;
867 
868 	/* Verify calling context */
869 	ASSERT(MDI_PHCI(phci_dip));
870 	vdip = mdi_devi_get_vdip(phci_dip);
871 	ASSERT(vdip);			/* A pHCI always has a vHCI */
872 
873 	/* extract two circular recursion values from single int */
874 	pcircular = (short)(circular & 0xFFFF);
875 	vcircular = (short)((circular >> 16) & 0xFFFF);
876 
877 	ndi_devi_exit(phci_dip, pcircular);
878 	if (vcircular != -1)
879 		ndi_devi_exit(vdip, vcircular);
880 }
881 
882 /*
883  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
884  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
885  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
886  * with vHCI power management code during path online/offline.  Each
887  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
888  * occur within the scope of an active mdi_devi_enter that establishes the
889  * circular value.
890  */
891 void
892 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
893 {
894 	int		pcircular;
895 
896 	/* Verify calling context */
897 	ASSERT(MDI_PHCI(phci_dip));
898 
899 	pcircular = (short)(circular & 0xFFFF);
900 	ndi_devi_exit(phci_dip, pcircular);
901 }
902 
903 void
904 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
905 {
906 	int		pcircular;
907 
908 	/* Verify calling context */
909 	ASSERT(MDI_PHCI(phci_dip));
910 
911 	ndi_devi_enter(phci_dip, &pcircular);
912 
913 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
914 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
915 }
916 
917 /*
918  * mdi_devi_get_vdip():
919  *		given a pHCI dip return vHCI dip
920  */
921 dev_info_t *
922 mdi_devi_get_vdip(dev_info_t *pdip)
923 {
924 	mdi_phci_t	*ph;
925 
926 	ph = i_devi_get_phci(pdip);
927 	if (ph && ph->ph_vhci)
928 		return (ph->ph_vhci->vh_dip);
929 	return (NULL);
930 }
931 
932 /*
933  * mdi_devi_pdip_entered():
934  *		Return 1 if we are vHCI and have done an ndi_devi_enter
935  *		of a pHCI
936  */
937 int
938 mdi_devi_pdip_entered(dev_info_t *vdip)
939 {
940 	mdi_vhci_t	*vh;
941 	mdi_phci_t	*ph;
942 
943 	vh = i_devi_get_vhci(vdip);
944 	if (vh == NULL)
945 		return (0);
946 
947 	MDI_VHCI_PHCI_LOCK(vh);
948 	ph = vh->vh_phci_head;
949 	while (ph) {
950 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
951 			MDI_VHCI_PHCI_UNLOCK(vh);
952 			return (1);
953 		}
954 		ph = ph->ph_next;
955 	}
956 	MDI_VHCI_PHCI_UNLOCK(vh);
957 	return (0);
958 }
959 
960 /*
961  * mdi_phci_path2devinfo():
962  * 		Utility function to search for a valid phci device given
963  *		the devfs pathname.
964  */
965 dev_info_t *
966 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
967 {
968 	char		*temp_pathname;
969 	mdi_vhci_t	*vh;
970 	mdi_phci_t	*ph;
971 	dev_info_t 	*pdip = NULL;
972 
973 	vh = i_devi_get_vhci(vdip);
974 	ASSERT(vh != NULL);
975 
976 	if (vh == NULL) {
977 		/*
978 		 * Invalid vHCI component, return failure
979 		 */
980 		return (NULL);
981 	}
982 
983 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
984 	MDI_VHCI_PHCI_LOCK(vh);
985 	ph = vh->vh_phci_head;
986 	while (ph != NULL) {
987 		pdip = ph->ph_dip;
988 		ASSERT(pdip != NULL);
989 		*temp_pathname = '\0';
990 		(void) ddi_pathname(pdip, temp_pathname);
991 		if (strcmp(temp_pathname, pathname) == 0) {
992 			break;
993 		}
994 		ph = ph->ph_next;
995 	}
996 	if (ph == NULL) {
997 		pdip = NULL;
998 	}
999 	MDI_VHCI_PHCI_UNLOCK(vh);
1000 	kmem_free(temp_pathname, MAXPATHLEN);
1001 	return (pdip);
1002 }
1003 
1004 /*
1005  * mdi_phci_get_path_count():
1006  * 		get number of path information nodes associated with a given
1007  *		pHCI device.
1008  */
1009 int
1010 mdi_phci_get_path_count(dev_info_t *pdip)
1011 {
1012 	mdi_phci_t	*ph;
1013 	int		count = 0;
1014 
1015 	ph = i_devi_get_phci(pdip);
1016 	if (ph != NULL) {
1017 		count = ph->ph_path_count;
1018 	}
1019 	return (count);
1020 }
1021 
1022 /*
1023  * i_mdi_phci_lock():
1024  *		Lock a pHCI device
1025  * Return Values:
1026  *		None
1027  * Note:
1028  *		The default locking order is:
1029  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
1030  *		But there are number of situations where locks need to be
1031  *		grabbed in reverse order.  This routine implements try and lock
1032  *		mechanism depending on the requested parameter option.
1033  */
1034 static void
1035 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
1036 {
1037 	if (pip) {
1038 		/* Reverse locking is requested. */
1039 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
1040 			/*
1041 			 * tryenter failed. Try to grab again
1042 			 * after a small delay
1043 			 */
1044 			MDI_PI_HOLD(pip);
1045 			MDI_PI_UNLOCK(pip);
1046 			delay(1);
1047 			MDI_PI_LOCK(pip);
1048 			MDI_PI_RELE(pip);
1049 		}
1050 	} else {
1051 		MDI_PHCI_LOCK(ph);
1052 	}
1053 }
1054 
1055 /*
1056  * i_mdi_phci_unlock():
1057  *		Unlock the pHCI component
1058  */
1059 static void
1060 i_mdi_phci_unlock(mdi_phci_t *ph)
1061 {
1062 	MDI_PHCI_UNLOCK(ph);
1063 }
1064 
1065 /*
1066  * i_mdi_devinfo_create():
1067  *		create client device's devinfo node
1068  * Return Values:
1069  *		dev_info
1070  *		NULL
1071  * Notes:
1072  */
1073 static dev_info_t *
1074 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1075 	char **compatible, int ncompatible)
1076 {
1077 	dev_info_t *cdip = NULL;
1078 
1079 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1080 
1081 	/* Verify for duplicate entry */
1082 	cdip = i_mdi_devinfo_find(vh, name, guid);
1083 	ASSERT(cdip == NULL);
1084 	if (cdip) {
1085 		cmn_err(CE_WARN,
1086 		    "i_mdi_devinfo_create: client dip %p already exists",
1087 			(void *)cdip);
1088 	}
1089 
1090 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1091 	if (cdip == NULL)
1092 		goto fail;
1093 
1094 	/*
1095 	 * Create component type and Global unique identifier
1096 	 * properties
1097 	 */
1098 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1099 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1100 		goto fail;
1101 	}
1102 
1103 	/* Decorate the node with compatible property */
1104 	if (compatible &&
1105 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1106 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1107 		goto fail;
1108 	}
1109 
1110 	return (cdip);
1111 
1112 fail:
1113 	if (cdip) {
1114 		(void) ndi_prop_remove_all(cdip);
1115 		(void) ndi_devi_free(cdip);
1116 	}
1117 	return (NULL);
1118 }
1119 
1120 /*
1121  * i_mdi_devinfo_find():
1122  *		Find a matching devinfo node for given client node name
1123  *		and its guid.
1124  * Return Values:
1125  *		Handle to a dev_info node or NULL
1126  */
1127 static dev_info_t *
1128 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1129 {
1130 	char			*data;
1131 	dev_info_t 		*cdip = NULL;
1132 	dev_info_t 		*ndip = NULL;
1133 	int			circular;
1134 
1135 	ndi_devi_enter(vh->vh_dip, &circular);
1136 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1137 	while ((cdip = ndip) != NULL) {
1138 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1139 
1140 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1141 			continue;
1142 		}
1143 
1144 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1145 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1146 		    &data) != DDI_PROP_SUCCESS) {
1147 			continue;
1148 		}
1149 
1150 		if (strcmp(data, guid) != 0) {
1151 			ddi_prop_free(data);
1152 			continue;
1153 		}
1154 		ddi_prop_free(data);
1155 		break;
1156 	}
1157 	ndi_devi_exit(vh->vh_dip, circular);
1158 	return (cdip);
1159 }
1160 
1161 /*
1162  * i_mdi_devinfo_remove():
1163  *		Remove a client device node
1164  */
1165 static int
1166 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1167 {
1168 	int	rv = MDI_SUCCESS;
1169 
1170 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1171 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1172 		rv = ndi_devi_offline(cdip, NDI_DEVI_REMOVE);
1173 		if (rv != NDI_SUCCESS) {
1174 			MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_devinfo_remove:"
1175 			    " failed. cdip = %p\n", (void *)cdip));
1176 		}
1177 		/*
1178 		 * Convert to MDI error code
1179 		 */
1180 		switch (rv) {
1181 		case NDI_SUCCESS:
1182 			rv = MDI_SUCCESS;
1183 			break;
1184 		case NDI_BUSY:
1185 			rv = MDI_BUSY;
1186 			break;
1187 		default:
1188 			rv = MDI_FAILURE;
1189 			break;
1190 		}
1191 	}
1192 	return (rv);
1193 }
1194 
1195 /*
1196  * i_devi_get_client()
1197  *		Utility function to get mpxio component extensions
1198  */
1199 static mdi_client_t *
1200 i_devi_get_client(dev_info_t *cdip)
1201 {
1202 	mdi_client_t	*ct = NULL;
1203 
1204 	if (MDI_CLIENT(cdip)) {
1205 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1206 	}
1207 	return (ct);
1208 }
1209 
1210 /*
1211  * i_mdi_is_child_present():
1212  *		Search for the presence of client device dev_info node
1213  */
1214 static int
1215 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1216 {
1217 	int		rv = MDI_FAILURE;
1218 	struct dev_info	*dip;
1219 	int		circular;
1220 
1221 	ndi_devi_enter(vdip, &circular);
1222 	dip = DEVI(vdip)->devi_child;
1223 	while (dip) {
1224 		if (dip == DEVI(cdip)) {
1225 			rv = MDI_SUCCESS;
1226 			break;
1227 		}
1228 		dip = dip->devi_sibling;
1229 	}
1230 	ndi_devi_exit(vdip, circular);
1231 	return (rv);
1232 }
1233 
1234 
1235 /*
1236  * i_mdi_client_lock():
1237  *		Grab client component lock
1238  * Return Values:
1239  *		None
1240  * Note:
1241  *		The default locking order is:
1242  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1243  *		But there are number of situations where locks need to be
1244  *		grabbed in reverse order.  This routine implements try and lock
1245  *		mechanism depending on the requested parameter option.
1246  */
1247 static void
1248 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1249 {
1250 	if (pip) {
1251 		/*
1252 		 * Reverse locking is requested.
1253 		 */
1254 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1255 			/*
1256 			 * tryenter failed. Try to grab again
1257 			 * after a small delay
1258 			 */
1259 			MDI_PI_HOLD(pip);
1260 			MDI_PI_UNLOCK(pip);
1261 			delay(1);
1262 			MDI_PI_LOCK(pip);
1263 			MDI_PI_RELE(pip);
1264 		}
1265 	} else {
1266 		MDI_CLIENT_LOCK(ct);
1267 	}
1268 }
1269 
1270 /*
1271  * i_mdi_client_unlock():
1272  *		Unlock a client component
1273  */
1274 static void
1275 i_mdi_client_unlock(mdi_client_t *ct)
1276 {
1277 	MDI_CLIENT_UNLOCK(ct);
1278 }
1279 
1280 /*
1281  * i_mdi_client_alloc():
1282  * 		Allocate and initialize a client structure.  Caller should
1283  *		hold the vhci client lock.
1284  * Return Values:
1285  *		Handle to a client component
1286  */
1287 /*ARGSUSED*/
1288 static mdi_client_t *
1289 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1290 {
1291 	mdi_client_t	*ct;
1292 
1293 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1294 
1295 	/*
1296 	 * Allocate and initialize a component structure.
1297 	 */
1298 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1299 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1300 	ct->ct_hnext = NULL;
1301 	ct->ct_hprev = NULL;
1302 	ct->ct_dip = NULL;
1303 	ct->ct_vhci = vh;
1304 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1305 	(void) strcpy(ct->ct_drvname, name);
1306 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1307 	(void) strcpy(ct->ct_guid, lguid);
1308 	ct->ct_cprivate = NULL;
1309 	ct->ct_vprivate = NULL;
1310 	ct->ct_flags = 0;
1311 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1312 	MDI_CLIENT_LOCK(ct);
1313 	MDI_CLIENT_SET_OFFLINE(ct);
1314 	MDI_CLIENT_SET_DETACH(ct);
1315 	MDI_CLIENT_SET_POWER_UP(ct);
1316 	MDI_CLIENT_UNLOCK(ct);
1317 	ct->ct_failover_flags = 0;
1318 	ct->ct_failover_status = 0;
1319 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1320 	ct->ct_unstable = 0;
1321 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1322 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1323 	ct->ct_lb = vh->vh_lb;
1324 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1325 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1326 	ct->ct_path_count = 0;
1327 	ct->ct_path_head = NULL;
1328 	ct->ct_path_tail = NULL;
1329 	ct->ct_path_last = NULL;
1330 
1331 	/*
1332 	 * Add this client component to our client hash queue
1333 	 */
1334 	i_mdi_client_enlist_table(vh, ct);
1335 	return (ct);
1336 }
1337 
1338 /*
1339  * i_mdi_client_enlist_table():
1340  *		Attach the client device to the client hash table. Caller
1341  *		should hold the vhci client lock.
1342  */
1343 static void
1344 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1345 {
1346 	int 			index;
1347 	struct client_hash	*head;
1348 
1349 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1350 
1351 	index = i_mdi_get_hash_key(ct->ct_guid);
1352 	head = &vh->vh_client_table[index];
1353 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1354 	head->ct_hash_head = ct;
1355 	head->ct_hash_count++;
1356 	vh->vh_client_count++;
1357 }
1358 
1359 /*
1360  * i_mdi_client_delist_table():
1361  *		Attach the client device to the client hash table.
1362  *		Caller should hold the vhci client lock.
1363  */
1364 static void
1365 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1366 {
1367 	int			index;
1368 	char			*guid;
1369 	struct client_hash 	*head;
1370 	mdi_client_t		*next;
1371 	mdi_client_t		*last;
1372 
1373 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1374 
1375 	guid = ct->ct_guid;
1376 	index = i_mdi_get_hash_key(guid);
1377 	head = &vh->vh_client_table[index];
1378 
1379 	last = NULL;
1380 	next = (mdi_client_t *)head->ct_hash_head;
1381 	while (next != NULL) {
1382 		if (next == ct) {
1383 			break;
1384 		}
1385 		last = next;
1386 		next = next->ct_hnext;
1387 	}
1388 
1389 	if (next) {
1390 		head->ct_hash_count--;
1391 		if (last == NULL) {
1392 			head->ct_hash_head = ct->ct_hnext;
1393 		} else {
1394 			last->ct_hnext = ct->ct_hnext;
1395 		}
1396 		ct->ct_hnext = NULL;
1397 		vh->vh_client_count--;
1398 	}
1399 }
1400 
1401 
1402 /*
1403  * i_mdi_client_free():
1404  *		Free a client component
1405  */
1406 static int
1407 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1408 {
1409 	int		rv = MDI_SUCCESS;
1410 	int		flags = ct->ct_flags;
1411 	dev_info_t	*cdip;
1412 	dev_info_t	*vdip;
1413 
1414 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1415 
1416 	vdip = vh->vh_dip;
1417 	cdip = ct->ct_dip;
1418 
1419 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1420 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1421 	DEVI(cdip)->devi_mdi_client = NULL;
1422 
1423 	/*
1424 	 * Clear out back ref. to dev_info_t node
1425 	 */
1426 	ct->ct_dip = NULL;
1427 
1428 	/*
1429 	 * Remove this client from our hash queue
1430 	 */
1431 	i_mdi_client_delist_table(vh, ct);
1432 
1433 	/*
1434 	 * Uninitialize and free the component
1435 	 */
1436 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1437 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1438 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1439 	cv_destroy(&ct->ct_failover_cv);
1440 	cv_destroy(&ct->ct_unstable_cv);
1441 	cv_destroy(&ct->ct_powerchange_cv);
1442 	mutex_destroy(&ct->ct_mutex);
1443 	kmem_free(ct, sizeof (*ct));
1444 
1445 	if (cdip != NULL) {
1446 		MDI_VHCI_CLIENT_UNLOCK(vh);
1447 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1448 		MDI_VHCI_CLIENT_LOCK(vh);
1449 	}
1450 	return (rv);
1451 }
1452 
1453 /*
1454  * i_mdi_client_find():
1455  * 		Find the client structure corresponding to a given guid
1456  *		Caller should hold the vhci client lock.
1457  */
1458 static mdi_client_t *
1459 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1460 {
1461 	int			index;
1462 	struct client_hash	*head;
1463 	mdi_client_t		*ct;
1464 
1465 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1466 
1467 	index = i_mdi_get_hash_key(guid);
1468 	head = &vh->vh_client_table[index];
1469 
1470 	ct = head->ct_hash_head;
1471 	while (ct != NULL) {
1472 		if (strcmp(ct->ct_guid, guid) == 0 &&
1473 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1474 			break;
1475 		}
1476 		ct = ct->ct_hnext;
1477 	}
1478 	return (ct);
1479 }
1480 
1481 /*
1482  * i_mdi_client_update_state():
1483  *		Compute and update client device state
1484  * Notes:
1485  *		A client device can be in any of three possible states:
1486  *
1487  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1488  *		one online/standby paths. Can tolerate failures.
1489  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1490  *		no alternate paths available as standby. A failure on the online
1491  *		would result in loss of access to device data.
1492  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1493  *		no paths available to access the device.
1494  */
1495 static void
1496 i_mdi_client_update_state(mdi_client_t *ct)
1497 {
1498 	int state;
1499 
1500 	ASSERT(MDI_CLIENT_LOCKED(ct));
1501 	state = i_mdi_client_compute_state(ct, NULL);
1502 	MDI_CLIENT_SET_STATE(ct, state);
1503 }
1504 
1505 /*
1506  * i_mdi_client_compute_state():
1507  *		Compute client device state
1508  *
1509  *		mdi_phci_t *	Pointer to pHCI structure which should
1510  *				while computing the new value.  Used by
1511  *				i_mdi_phci_offline() to find the new
1512  *				client state after DR of a pHCI.
1513  */
1514 static int
1515 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1516 {
1517 	int		state;
1518 	int		online_count = 0;
1519 	int		standby_count = 0;
1520 	mdi_pathinfo_t	*pip, *next;
1521 
1522 	ASSERT(MDI_CLIENT_LOCKED(ct));
1523 	pip = ct->ct_path_head;
1524 	while (pip != NULL) {
1525 		MDI_PI_LOCK(pip);
1526 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1527 		if (MDI_PI(pip)->pi_phci == ph) {
1528 			MDI_PI_UNLOCK(pip);
1529 			pip = next;
1530 			continue;
1531 		}
1532 
1533 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1534 				== MDI_PATHINFO_STATE_ONLINE)
1535 			online_count++;
1536 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1537 				== MDI_PATHINFO_STATE_STANDBY)
1538 			standby_count++;
1539 		MDI_PI_UNLOCK(pip);
1540 		pip = next;
1541 	}
1542 
1543 	if (online_count == 0) {
1544 		if (standby_count == 0) {
1545 			state = MDI_CLIENT_STATE_FAILED;
1546 			MDI_DEBUG(2, (CE_NOTE, NULL, "!client state: failed"
1547 			    " ct = %p\n", (void *)ct));
1548 		} else if (standby_count == 1) {
1549 			state = MDI_CLIENT_STATE_DEGRADED;
1550 		} else {
1551 			state = MDI_CLIENT_STATE_OPTIMAL;
1552 		}
1553 	} else if (online_count == 1) {
1554 		if (standby_count == 0) {
1555 			state = MDI_CLIENT_STATE_DEGRADED;
1556 		} else {
1557 			state = MDI_CLIENT_STATE_OPTIMAL;
1558 		}
1559 	} else {
1560 		state = MDI_CLIENT_STATE_OPTIMAL;
1561 	}
1562 	return (state);
1563 }
1564 
1565 /*
1566  * i_mdi_client2devinfo():
1567  *		Utility function
1568  */
1569 dev_info_t *
1570 i_mdi_client2devinfo(mdi_client_t *ct)
1571 {
1572 	return (ct->ct_dip);
1573 }
1574 
1575 /*
1576  * mdi_client_path2_devinfo():
1577  * 		Given the parent devinfo and child devfs pathname, search for
1578  *		a valid devfs node handle.
1579  */
1580 dev_info_t *
1581 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1582 {
1583 	dev_info_t 	*cdip = NULL;
1584 	dev_info_t 	*ndip = NULL;
1585 	char		*temp_pathname;
1586 	int		circular;
1587 
1588 	/*
1589 	 * Allocate temp buffer
1590 	 */
1591 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1592 
1593 	/*
1594 	 * Lock parent against changes
1595 	 */
1596 	ndi_devi_enter(vdip, &circular);
1597 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1598 	while ((cdip = ndip) != NULL) {
1599 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1600 
1601 		*temp_pathname = '\0';
1602 		(void) ddi_pathname(cdip, temp_pathname);
1603 		if (strcmp(temp_pathname, pathname) == 0) {
1604 			break;
1605 		}
1606 	}
1607 	/*
1608 	 * Release devinfo lock
1609 	 */
1610 	ndi_devi_exit(vdip, circular);
1611 
1612 	/*
1613 	 * Free the temp buffer
1614 	 */
1615 	kmem_free(temp_pathname, MAXPATHLEN);
1616 	return (cdip);
1617 }
1618 
1619 /*
1620  * mdi_client_get_path_count():
1621  * 		Utility function to get number of path information nodes
1622  *		associated with a given client device.
1623  */
1624 int
1625 mdi_client_get_path_count(dev_info_t *cdip)
1626 {
1627 	mdi_client_t	*ct;
1628 	int		count = 0;
1629 
1630 	ct = i_devi_get_client(cdip);
1631 	if (ct != NULL) {
1632 		count = ct->ct_path_count;
1633 	}
1634 	return (count);
1635 }
1636 
1637 
1638 /*
1639  * i_mdi_get_hash_key():
1640  * 		Create a hash using strings as keys
1641  *
1642  */
1643 static int
1644 i_mdi_get_hash_key(char *str)
1645 {
1646 	uint32_t	g, hash = 0;
1647 	char		*p;
1648 
1649 	for (p = str; *p != '\0'; p++) {
1650 		g = *p;
1651 		hash += g;
1652 	}
1653 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1654 }
1655 
1656 /*
1657  * mdi_get_lb_policy():
1658  * 		Get current load balancing policy for a given client device
1659  */
1660 client_lb_t
1661 mdi_get_lb_policy(dev_info_t *cdip)
1662 {
1663 	client_lb_t	lb = LOAD_BALANCE_NONE;
1664 	mdi_client_t	*ct;
1665 
1666 	ct = i_devi_get_client(cdip);
1667 	if (ct != NULL) {
1668 		lb = ct->ct_lb;
1669 	}
1670 	return (lb);
1671 }
1672 
1673 /*
1674  * mdi_set_lb_region_size():
1675  * 		Set current region size for the load-balance
1676  */
1677 int
1678 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1679 {
1680 	mdi_client_t	*ct;
1681 	int		rv = MDI_FAILURE;
1682 
1683 	ct = i_devi_get_client(cdip);
1684 	if (ct != NULL && ct->ct_lb_args != NULL) {
1685 		ct->ct_lb_args->region_size = region_size;
1686 		rv = MDI_SUCCESS;
1687 	}
1688 	return (rv);
1689 }
1690 
1691 /*
1692  * mdi_Set_lb_policy():
1693  * 		Set current load balancing policy for a given client device
1694  */
1695 int
1696 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1697 {
1698 	mdi_client_t	*ct;
1699 	int		rv = MDI_FAILURE;
1700 
1701 	ct = i_devi_get_client(cdip);
1702 	if (ct != NULL) {
1703 		ct->ct_lb = lb;
1704 		rv = MDI_SUCCESS;
1705 	}
1706 	return (rv);
1707 }
1708 
1709 /*
1710  * mdi_failover():
1711  *		failover function called by the vHCI drivers to initiate
1712  *		a failover operation.  This is typically due to non-availability
1713  *		of online paths to route I/O requests.  Failover can be
1714  *		triggered through user application also.
1715  *
1716  *		The vHCI driver calls mdi_failover() to initiate a failover
1717  *		operation. mdi_failover() calls back into the vHCI driver's
1718  *		vo_failover() entry point to perform the actual failover
1719  *		operation.  The reason for requiring the vHCI driver to
1720  *		initiate failover by calling mdi_failover(), instead of directly
1721  *		executing vo_failover() itself, is to ensure that the mdi
1722  *		framework can keep track of the client state properly.
1723  *		Additionally, mdi_failover() provides as a convenience the
1724  *		option of performing the failover operation synchronously or
1725  *		asynchronously
1726  *
1727  *		Upon successful completion of the failover operation, the
1728  *		paths that were previously ONLINE will be in the STANDBY state,
1729  *		and the newly activated paths will be in the ONLINE state.
1730  *
1731  *		The flags modifier determines whether the activation is done
1732  *		synchronously: MDI_FAILOVER_SYNC
1733  * Return Values:
1734  *		MDI_SUCCESS
1735  *		MDI_FAILURE
1736  *		MDI_BUSY
1737  */
1738 /*ARGSUSED*/
1739 int
1740 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1741 {
1742 	int			rv;
1743 	mdi_client_t		*ct;
1744 
1745 	ct = i_devi_get_client(cdip);
1746 	ASSERT(ct != NULL);
1747 	if (ct == NULL) {
1748 		/* cdip is not a valid client device. Nothing more to do. */
1749 		return (MDI_FAILURE);
1750 	}
1751 
1752 	MDI_CLIENT_LOCK(ct);
1753 
1754 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1755 		/* A path to the client is being freed */
1756 		MDI_CLIENT_UNLOCK(ct);
1757 		return (MDI_BUSY);
1758 	}
1759 
1760 
1761 	if (MDI_CLIENT_IS_FAILED(ct)) {
1762 		/*
1763 		 * Client is in failed state. Nothing more to do.
1764 		 */
1765 		MDI_CLIENT_UNLOCK(ct);
1766 		return (MDI_FAILURE);
1767 	}
1768 
1769 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1770 		/*
1771 		 * Failover is already in progress; return BUSY
1772 		 */
1773 		MDI_CLIENT_UNLOCK(ct);
1774 		return (MDI_BUSY);
1775 	}
1776 	/*
1777 	 * Make sure that mdi_pathinfo node state changes are processed.
1778 	 * We do not allow failovers to progress while client path state
1779 	 * changes are in progress
1780 	 */
1781 	if (ct->ct_unstable) {
1782 		if (flags == MDI_FAILOVER_ASYNC) {
1783 			MDI_CLIENT_UNLOCK(ct);
1784 			return (MDI_BUSY);
1785 		} else {
1786 			while (ct->ct_unstable)
1787 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1788 		}
1789 	}
1790 
1791 	/*
1792 	 * Client device is in stable state. Before proceeding, perform sanity
1793 	 * checks again.
1794 	 */
1795 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1796 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1797 		/*
1798 		 * Client is in failed state. Nothing more to do.
1799 		 */
1800 		MDI_CLIENT_UNLOCK(ct);
1801 		return (MDI_FAILURE);
1802 	}
1803 
1804 	/*
1805 	 * Set the client state as failover in progress.
1806 	 */
1807 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1808 	ct->ct_failover_flags = flags;
1809 	MDI_CLIENT_UNLOCK(ct);
1810 
1811 	if (flags == MDI_FAILOVER_ASYNC) {
1812 		/*
1813 		 * Submit the initiate failover request via CPR safe
1814 		 * taskq threads.
1815 		 */
1816 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1817 		    ct, KM_SLEEP);
1818 		return (MDI_ACCEPT);
1819 	} else {
1820 		/*
1821 		 * Synchronous failover mode.  Typically invoked from the user
1822 		 * land.
1823 		 */
1824 		rv = i_mdi_failover(ct);
1825 	}
1826 	return (rv);
1827 }
1828 
1829 /*
1830  * i_mdi_failover():
1831  *		internal failover function. Invokes vHCI drivers failover
1832  *		callback function and process the failover status
1833  * Return Values:
1834  *		None
1835  *
1836  * Note: A client device in failover state can not be detached or freed.
1837  */
1838 static int
1839 i_mdi_failover(void *arg)
1840 {
1841 	int		rv = MDI_SUCCESS;
1842 	mdi_client_t	*ct = (mdi_client_t *)arg;
1843 	mdi_vhci_t	*vh = ct->ct_vhci;
1844 
1845 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1846 
1847 	if (vh->vh_ops->vo_failover != NULL) {
1848 		/*
1849 		 * Call vHCI drivers callback routine
1850 		 */
1851 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1852 		    ct->ct_failover_flags);
1853 	}
1854 
1855 	MDI_CLIENT_LOCK(ct);
1856 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1857 
1858 	/*
1859 	 * Save the failover return status
1860 	 */
1861 	ct->ct_failover_status = rv;
1862 
1863 	/*
1864 	 * As a result of failover, client status would have been changed.
1865 	 * Update the client state and wake up anyone waiting on this client
1866 	 * device.
1867 	 */
1868 	i_mdi_client_update_state(ct);
1869 
1870 	cv_broadcast(&ct->ct_failover_cv);
1871 	MDI_CLIENT_UNLOCK(ct);
1872 	return (rv);
1873 }
1874 
1875 /*
1876  * Load balancing is logical block.
1877  * IOs within the range described by region_size
1878  * would go on the same path. This would improve the
1879  * performance by cache-hit on some of the RAID devices.
1880  * Search only for online paths(At some point we
1881  * may want to balance across target ports).
1882  * If no paths are found then default to round-robin.
1883  */
1884 static int
1885 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1886 {
1887 	int		path_index = -1;
1888 	int		online_path_count = 0;
1889 	int		online_nonpref_path_count = 0;
1890 	int 		region_size = ct->ct_lb_args->region_size;
1891 	mdi_pathinfo_t	*pip;
1892 	mdi_pathinfo_t	*next;
1893 	int		preferred, path_cnt;
1894 
1895 	pip = ct->ct_path_head;
1896 	while (pip) {
1897 		MDI_PI_LOCK(pip);
1898 		if (MDI_PI(pip)->pi_state ==
1899 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1900 			online_path_count++;
1901 		} else if (MDI_PI(pip)->pi_state ==
1902 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1903 			online_nonpref_path_count++;
1904 		}
1905 		next = (mdi_pathinfo_t *)
1906 		    MDI_PI(pip)->pi_client_link;
1907 		MDI_PI_UNLOCK(pip);
1908 		pip = next;
1909 	}
1910 	/* if found any online/preferred then use this type */
1911 	if (online_path_count > 0) {
1912 		path_cnt = online_path_count;
1913 		preferred = 1;
1914 	} else if (online_nonpref_path_count > 0) {
1915 		path_cnt = online_nonpref_path_count;
1916 		preferred = 0;
1917 	} else {
1918 		path_cnt = 0;
1919 	}
1920 	if (path_cnt) {
1921 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1922 		pip = ct->ct_path_head;
1923 		while (pip && path_index != -1) {
1924 			MDI_PI_LOCK(pip);
1925 			if (path_index == 0 &&
1926 			    (MDI_PI(pip)->pi_state ==
1927 			    MDI_PATHINFO_STATE_ONLINE) &&
1928 				MDI_PI(pip)->pi_preferred == preferred) {
1929 				MDI_PI_HOLD(pip);
1930 				MDI_PI_UNLOCK(pip);
1931 				*ret_pip = pip;
1932 				return (MDI_SUCCESS);
1933 			}
1934 			path_index --;
1935 			next = (mdi_pathinfo_t *)
1936 			    MDI_PI(pip)->pi_client_link;
1937 			MDI_PI_UNLOCK(pip);
1938 			pip = next;
1939 		}
1940 		if (pip == NULL) {
1941 			MDI_DEBUG(4, (CE_NOTE, NULL,
1942 			    "!lba %llx, no pip !!\n",
1943 				bp->b_lblkno));
1944 		} else {
1945 			MDI_DEBUG(4, (CE_NOTE, NULL,
1946 			    "!lba %llx, no pip for path_index, "
1947 			    "pip %p\n", bp->b_lblkno, (void *)pip));
1948 		}
1949 	}
1950 	return (MDI_FAILURE);
1951 }
1952 
1953 /*
1954  * mdi_select_path():
1955  *		select a path to access a client device.
1956  *
1957  *		mdi_select_path() function is called by the vHCI drivers to
1958  *		select a path to route the I/O request to.  The caller passes
1959  *		the block I/O data transfer structure ("buf") as one of the
1960  *		parameters.  The mpxio framework uses the buf structure
1961  *		contents to maintain per path statistics (total I/O size /
1962  *		count pending).  If more than one online paths are available to
1963  *		select, the framework automatically selects a suitable path
1964  *		for routing I/O request. If a failover operation is active for
1965  *		this client device the call shall be failed with MDI_BUSY error
1966  *		code.
1967  *
1968  *		By default this function returns a suitable path in online
1969  *		state based on the current load balancing policy.  Currently
1970  *		we support LOAD_BALANCE_NONE (Previously selected online path
1971  *		will continue to be used till the path is usable) and
1972  *		LOAD_BALANCE_RR (Online paths will be selected in a round
1973  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
1974  *		based on the logical block).  The load balancing
1975  *		through vHCI drivers configuration file (driver.conf).
1976  *
1977  *		vHCI drivers may override this default behavior by specifying
1978  *		appropriate flags.  The meaning of the thrid argument depends
1979  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
1980  *		then the argument is the "path instance" of the path to select.
1981  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
1982  *		"start_pip". A non NULL "start_pip" is the starting point to
1983  *		walk and find the next appropriate path.  The following values
1984  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
1985  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
1986  *		STANDBY path).
1987  *
1988  *		The non-standard behavior is used by the scsi_vhci driver,
1989  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
1990  *		attach of client devices (to avoid an unnecessary failover
1991  *		when the STANDBY path comes up first), during failover
1992  *		(to activate a STANDBY path as ONLINE).
1993  *
1994  *		The selected path is returned in a a mdi_hold_path() state
1995  *		(pi_ref_cnt). Caller should release the hold by calling
1996  *		mdi_rele_path().
1997  *
1998  * Return Values:
1999  *		MDI_SUCCESS	- Completed successfully
2000  *		MDI_BUSY 	- Client device is busy failing over
2001  *		MDI_NOPATH	- Client device is online, but no valid path are
2002  *				  available to access this client device
2003  *		MDI_FAILURE	- Invalid client device or state
2004  *		MDI_DEVI_ONLINING
2005  *				- Client device (struct dev_info state) is in
2006  *				  onlining state.
2007  */
2008 
2009 /*ARGSUSED*/
2010 int
2011 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
2012     void *arg, mdi_pathinfo_t **ret_pip)
2013 {
2014 	mdi_client_t	*ct;
2015 	mdi_pathinfo_t	*pip;
2016 	mdi_pathinfo_t	*next;
2017 	mdi_pathinfo_t	*head;
2018 	mdi_pathinfo_t	*start;
2019 	client_lb_t	lbp;	/* load balancing policy */
2020 	int		sb = 1;	/* standard behavior */
2021 	int		preferred = 1;	/* preferred path */
2022 	int		cond, cont = 1;
2023 	int		retry = 0;
2024 	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
2025 	int		path_instance;	/* request specific path instance */
2026 
2027 	/* determine type of arg based on flags */
2028 	if (flags & MDI_SELECT_PATH_INSTANCE) {
2029 		flags &= ~MDI_SELECT_PATH_INSTANCE;
2030 		path_instance = (int)(intptr_t)arg;
2031 		start_pip = NULL;
2032 	} else {
2033 		path_instance = 0;
2034 		start_pip = (mdi_pathinfo_t *)arg;
2035 	}
2036 
2037 	if (flags != 0) {
2038 		/*
2039 		 * disable default behavior
2040 		 */
2041 		sb = 0;
2042 	}
2043 
2044 	*ret_pip = NULL;
2045 	ct = i_devi_get_client(cdip);
2046 	if (ct == NULL) {
2047 		/* mdi extensions are NULL, Nothing more to do */
2048 		return (MDI_FAILURE);
2049 	}
2050 
2051 	MDI_CLIENT_LOCK(ct);
2052 
2053 	if (sb) {
2054 		if (MDI_CLIENT_IS_FAILED(ct)) {
2055 			/*
2056 			 * Client is not ready to accept any I/O requests.
2057 			 * Fail this request.
2058 			 */
2059 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
2060 			    "client state offline ct = %p\n", (void *)ct));
2061 			MDI_CLIENT_UNLOCK(ct);
2062 			return (MDI_FAILURE);
2063 		}
2064 
2065 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
2066 			/*
2067 			 * Check for Failover is in progress. If so tell the
2068 			 * caller that this device is busy.
2069 			 */
2070 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
2071 			    "client failover in progress ct = %p\n",
2072 			    (void *)ct));
2073 			MDI_CLIENT_UNLOCK(ct);
2074 			return (MDI_BUSY);
2075 		}
2076 
2077 		/*
2078 		 * Check to see whether the client device is attached.
2079 		 * If not so, let the vHCI driver manually select a path
2080 		 * (standby) and let the probe/attach process to continue.
2081 		 */
2082 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2083 			MDI_DEBUG(4, (CE_NOTE, cdip, "!Devi is onlining "
2084 			    "ct = %p\n", (void *)ct));
2085 			MDI_CLIENT_UNLOCK(ct);
2086 			return (MDI_DEVI_ONLINING);
2087 		}
2088 	}
2089 
2090 	/*
2091 	 * Cache in the client list head.  If head of the list is NULL
2092 	 * return MDI_NOPATH
2093 	 */
2094 	head = ct->ct_path_head;
2095 	if (head == NULL) {
2096 		MDI_CLIENT_UNLOCK(ct);
2097 		return (MDI_NOPATH);
2098 	}
2099 
2100 	/* Caller is specifying a specific pathinfo path by path_instance */
2101 	if (path_instance) {
2102 		/* search for pathinfo with correct path_instance */
2103 		for (pip = head;
2104 		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
2105 		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
2106 			;
2107 
2108 		/* If path can't be selected then MDI_FAILURE is returned. */
2109 		if (pip == NULL) {
2110 			MDI_CLIENT_UNLOCK(ct);
2111 			return (MDI_FAILURE);
2112 		}
2113 
2114 		/* verify state of path */
2115 		MDI_PI_LOCK(pip);
2116 		if (MDI_PI(pip)->pi_state != MDI_PATHINFO_STATE_ONLINE) {
2117 			MDI_PI_UNLOCK(pip);
2118 			MDI_CLIENT_UNLOCK(ct);
2119 			return (MDI_FAILURE);
2120 		}
2121 
2122 		/*
2123 		 * Return the path in hold state. Caller should release the
2124 		 * lock by calling mdi_rele_path()
2125 		 */
2126 		MDI_PI_HOLD(pip);
2127 		MDI_PI_UNLOCK(pip);
2128 		ct->ct_path_last = pip;
2129 		*ret_pip = pip;
2130 		MDI_CLIENT_UNLOCK(ct);
2131 		return (MDI_SUCCESS);
2132 	}
2133 
2134 	/*
2135 	 * for non default behavior, bypass current
2136 	 * load balancing policy and always use LOAD_BALANCE_RR
2137 	 * except that the start point will be adjusted based
2138 	 * on the provided start_pip
2139 	 */
2140 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2141 
2142 	switch (lbp) {
2143 	case LOAD_BALANCE_NONE:
2144 		/*
2145 		 * Load balancing is None  or Alternate path mode
2146 		 * Start looking for a online mdi_pathinfo node starting from
2147 		 * last known selected path
2148 		 */
2149 		preferred = 1;
2150 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2151 		if (pip == NULL) {
2152 			pip = head;
2153 		}
2154 		start = pip;
2155 		do {
2156 			MDI_PI_LOCK(pip);
2157 			/*
2158 			 * No need to explicitly check if the path is disabled.
2159 			 * Since we are checking for state == ONLINE and the
2160 			 * same variable is used for DISABLE/ENABLE information.
2161 			 */
2162 			if ((MDI_PI(pip)->pi_state  ==
2163 				MDI_PATHINFO_STATE_ONLINE) &&
2164 				preferred == MDI_PI(pip)->pi_preferred) {
2165 				/*
2166 				 * Return the path in hold state. Caller should
2167 				 * release the lock by calling mdi_rele_path()
2168 				 */
2169 				MDI_PI_HOLD(pip);
2170 				MDI_PI_UNLOCK(pip);
2171 				ct->ct_path_last = pip;
2172 				*ret_pip = pip;
2173 				MDI_CLIENT_UNLOCK(ct);
2174 				return (MDI_SUCCESS);
2175 			}
2176 
2177 			/*
2178 			 * Path is busy.
2179 			 */
2180 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2181 			    MDI_PI_IS_TRANSIENT(pip))
2182 				retry = 1;
2183 			/*
2184 			 * Keep looking for a next available online path
2185 			 */
2186 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2187 			if (next == NULL) {
2188 				next = head;
2189 			}
2190 			MDI_PI_UNLOCK(pip);
2191 			pip = next;
2192 			if (start == pip && preferred) {
2193 				preferred = 0;
2194 			} else if (start == pip && !preferred) {
2195 				cont = 0;
2196 			}
2197 		} while (cont);
2198 		break;
2199 
2200 	case LOAD_BALANCE_LBA:
2201 		/*
2202 		 * Make sure we are looking
2203 		 * for an online path. Otherwise, if it is for a STANDBY
2204 		 * path request, it will go through and fetch an ONLINE
2205 		 * path which is not desirable.
2206 		 */
2207 		if ((ct->ct_lb_args != NULL) &&
2208 			    (ct->ct_lb_args->region_size) && bp &&
2209 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2210 			if (i_mdi_lba_lb(ct, ret_pip, bp)
2211 				    == MDI_SUCCESS) {
2212 				MDI_CLIENT_UNLOCK(ct);
2213 				return (MDI_SUCCESS);
2214 			}
2215 		}
2216 		/*  FALLTHROUGH */
2217 	case LOAD_BALANCE_RR:
2218 		/*
2219 		 * Load balancing is Round Robin. Start looking for a online
2220 		 * mdi_pathinfo node starting from last known selected path
2221 		 * as the start point.  If override flags are specified,
2222 		 * process accordingly.
2223 		 * If the search is already in effect(start_pip not null),
2224 		 * then lets just use the same path preference to continue the
2225 		 * traversal.
2226 		 */
2227 
2228 		if (start_pip != NULL) {
2229 			preferred = MDI_PI(start_pip)->pi_preferred;
2230 		} else {
2231 			preferred = 1;
2232 		}
2233 
2234 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2235 		if (start == NULL) {
2236 			pip = head;
2237 		} else {
2238 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2239 			if (pip == NULL) {
2240 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2241 					/*
2242 					 * Return since we hit the end of list
2243 					 */
2244 					MDI_CLIENT_UNLOCK(ct);
2245 					return (MDI_NOPATH);
2246 				}
2247 
2248 				if (!sb) {
2249 					if (preferred == 0) {
2250 						/*
2251 						 * Looks like we have completed
2252 						 * the traversal as preferred
2253 						 * value is 0. Time to bail out.
2254 						 */
2255 						*ret_pip = NULL;
2256 						MDI_CLIENT_UNLOCK(ct);
2257 						return (MDI_NOPATH);
2258 					} else {
2259 						/*
2260 						 * Looks like we reached the
2261 						 * end of the list. Lets enable
2262 						 * traversal of non preferred
2263 						 * paths.
2264 						 */
2265 						preferred = 0;
2266 					}
2267 				}
2268 				pip = head;
2269 			}
2270 		}
2271 		start = pip;
2272 		do {
2273 			MDI_PI_LOCK(pip);
2274 			if (sb) {
2275 				cond = ((MDI_PI(pip)->pi_state ==
2276 				    MDI_PATHINFO_STATE_ONLINE &&
2277 					MDI_PI(pip)->pi_preferred ==
2278 						preferred) ? 1 : 0);
2279 			} else {
2280 				if (flags == MDI_SELECT_ONLINE_PATH) {
2281 					cond = ((MDI_PI(pip)->pi_state ==
2282 					    MDI_PATHINFO_STATE_ONLINE &&
2283 						MDI_PI(pip)->pi_preferred ==
2284 						preferred) ? 1 : 0);
2285 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2286 					cond = ((MDI_PI(pip)->pi_state ==
2287 					    MDI_PATHINFO_STATE_STANDBY &&
2288 						MDI_PI(pip)->pi_preferred ==
2289 						preferred) ? 1 : 0);
2290 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2291 				    MDI_SELECT_STANDBY_PATH)) {
2292 					cond = (((MDI_PI(pip)->pi_state ==
2293 					    MDI_PATHINFO_STATE_ONLINE ||
2294 					    (MDI_PI(pip)->pi_state ==
2295 					    MDI_PATHINFO_STATE_STANDBY)) &&
2296 						MDI_PI(pip)->pi_preferred ==
2297 						preferred) ? 1 : 0);
2298 				} else if (flags ==
2299 					(MDI_SELECT_STANDBY_PATH |
2300 					MDI_SELECT_ONLINE_PATH |
2301 					MDI_SELECT_USER_DISABLE_PATH)) {
2302 					cond = (((MDI_PI(pip)->pi_state ==
2303 					    MDI_PATHINFO_STATE_ONLINE ||
2304 					    (MDI_PI(pip)->pi_state ==
2305 					    MDI_PATHINFO_STATE_STANDBY) ||
2306 						(MDI_PI(pip)->pi_state ==
2307 					    (MDI_PATHINFO_STATE_ONLINE|
2308 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2309 						(MDI_PI(pip)->pi_state ==
2310 					    (MDI_PATHINFO_STATE_STANDBY |
2311 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2312 						MDI_PI(pip)->pi_preferred ==
2313 						preferred) ? 1 : 0);
2314 				} else if (flags ==
2315 				    (MDI_SELECT_STANDBY_PATH |
2316 				    MDI_SELECT_ONLINE_PATH |
2317 				    MDI_SELECT_NO_PREFERRED)) {
2318 					cond = (((MDI_PI(pip)->pi_state ==
2319 					    MDI_PATHINFO_STATE_ONLINE) ||
2320 					    (MDI_PI(pip)->pi_state ==
2321 					    MDI_PATHINFO_STATE_STANDBY))
2322 					    ? 1 : 0);
2323 				} else {
2324 					cond = 0;
2325 				}
2326 			}
2327 			/*
2328 			 * No need to explicitly check if the path is disabled.
2329 			 * Since we are checking for state == ONLINE and the
2330 			 * same variable is used for DISABLE/ENABLE information.
2331 			 */
2332 			if (cond) {
2333 				/*
2334 				 * Return the path in hold state. Caller should
2335 				 * release the lock by calling mdi_rele_path()
2336 				 */
2337 				MDI_PI_HOLD(pip);
2338 				MDI_PI_UNLOCK(pip);
2339 				if (sb)
2340 					ct->ct_path_last = pip;
2341 				*ret_pip = pip;
2342 				MDI_CLIENT_UNLOCK(ct);
2343 				return (MDI_SUCCESS);
2344 			}
2345 			/*
2346 			 * Path is busy.
2347 			 */
2348 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2349 			    MDI_PI_IS_TRANSIENT(pip))
2350 				retry = 1;
2351 
2352 			/*
2353 			 * Keep looking for a next available online path
2354 			 */
2355 do_again:
2356 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2357 			if (next == NULL) {
2358 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2359 					/*
2360 					 * Bail out since we hit the end of list
2361 					 */
2362 					MDI_PI_UNLOCK(pip);
2363 					break;
2364 				}
2365 
2366 				if (!sb) {
2367 					if (preferred == 1) {
2368 						/*
2369 						 * Looks like we reached the
2370 						 * end of the list. Lets enable
2371 						 * traversal of non preferred
2372 						 * paths.
2373 						 */
2374 						preferred = 0;
2375 						next = head;
2376 					} else {
2377 						/*
2378 						 * We have done both the passes
2379 						 * Preferred as well as for
2380 						 * Non-preferred. Bail out now.
2381 						 */
2382 						cont = 0;
2383 					}
2384 				} else {
2385 					/*
2386 					 * Standard behavior case.
2387 					 */
2388 					next = head;
2389 				}
2390 			}
2391 			MDI_PI_UNLOCK(pip);
2392 			if (cont == 0) {
2393 				break;
2394 			}
2395 			pip = next;
2396 
2397 			if (!sb) {
2398 				/*
2399 				 * We need to handle the selection of
2400 				 * non-preferred path in the following
2401 				 * case:
2402 				 *
2403 				 * +------+   +------+   +------+   +-----+
2404 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2405 				 * +------+   +------+   +------+   +-----+
2406 				 *
2407 				 * If we start the search with B, we need to
2408 				 * skip beyond B to pick C which is non -
2409 				 * preferred in the second pass. The following
2410 				 * test, if true, will allow us to skip over
2411 				 * the 'start'(B in the example) to select
2412 				 * other non preferred elements.
2413 				 */
2414 				if ((start_pip != NULL) && (start_pip == pip) &&
2415 				    (MDI_PI(start_pip)->pi_preferred
2416 				    != preferred)) {
2417 					/*
2418 					 * try again after going past the start
2419 					 * pip
2420 					 */
2421 					MDI_PI_LOCK(pip);
2422 					goto do_again;
2423 				}
2424 			} else {
2425 				/*
2426 				 * Standard behavior case
2427 				 */
2428 				if (start == pip && preferred) {
2429 					/* look for nonpreferred paths */
2430 					preferred = 0;
2431 				} else if (start == pip && !preferred) {
2432 					/*
2433 					 * Exit condition
2434 					 */
2435 					cont = 0;
2436 				}
2437 			}
2438 		} while (cont);
2439 		break;
2440 	}
2441 
2442 	MDI_CLIENT_UNLOCK(ct);
2443 	if (retry == 1) {
2444 		return (MDI_BUSY);
2445 	} else {
2446 		return (MDI_NOPATH);
2447 	}
2448 }
2449 
2450 /*
2451  * For a client, return the next available path to any phci
2452  *
2453  * Note:
2454  *		Caller should hold the branch's devinfo node to get a consistent
2455  *		snap shot of the mdi_pathinfo nodes.
2456  *
2457  *		Please note that even the list is stable the mdi_pathinfo
2458  *		node state and properties are volatile.  The caller should lock
2459  *		and unlock the nodes by calling mdi_pi_lock() and
2460  *		mdi_pi_unlock() functions to get a stable properties.
2461  *
2462  *		If there is a need to use the nodes beyond the hold of the
2463  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2464  *		need to be held against unexpected removal by calling
2465  *		mdi_hold_path() and should be released by calling
2466  *		mdi_rele_path() on completion.
2467  */
2468 mdi_pathinfo_t *
2469 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2470 {
2471 	mdi_client_t *ct;
2472 
2473 	if (!MDI_CLIENT(ct_dip))
2474 		return (NULL);
2475 
2476 	/*
2477 	 * Walk through client link
2478 	 */
2479 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2480 	ASSERT(ct != NULL);
2481 
2482 	if (pip == NULL)
2483 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2484 
2485 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2486 }
2487 
2488 /*
2489  * For a phci, return the next available path to any client
2490  * Note: ditto mdi_get_next_phci_path()
2491  */
2492 mdi_pathinfo_t *
2493 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2494 {
2495 	mdi_phci_t *ph;
2496 
2497 	if (!MDI_PHCI(ph_dip))
2498 		return (NULL);
2499 
2500 	/*
2501 	 * Walk through pHCI link
2502 	 */
2503 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2504 	ASSERT(ph != NULL);
2505 
2506 	if (pip == NULL)
2507 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2508 
2509 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2510 }
2511 
2512 /*
2513  * mdi_hold_path():
2514  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2515  * Return Values:
2516  *		None
2517  */
2518 void
2519 mdi_hold_path(mdi_pathinfo_t *pip)
2520 {
2521 	if (pip) {
2522 		MDI_PI_LOCK(pip);
2523 		MDI_PI_HOLD(pip);
2524 		MDI_PI_UNLOCK(pip);
2525 	}
2526 }
2527 
2528 
2529 /*
2530  * mdi_rele_path():
2531  *		Release the mdi_pathinfo node which was selected
2532  *		through mdi_select_path() mechanism or manually held by
2533  *		calling mdi_hold_path().
2534  * Return Values:
2535  *		None
2536  */
2537 void
2538 mdi_rele_path(mdi_pathinfo_t *pip)
2539 {
2540 	if (pip) {
2541 		MDI_PI_LOCK(pip);
2542 		MDI_PI_RELE(pip);
2543 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2544 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2545 		}
2546 		MDI_PI_UNLOCK(pip);
2547 	}
2548 }
2549 
2550 /*
2551  * mdi_pi_lock():
2552  * 		Lock the mdi_pathinfo node.
2553  * Note:
2554  *		The caller should release the lock by calling mdi_pi_unlock()
2555  */
2556 void
2557 mdi_pi_lock(mdi_pathinfo_t *pip)
2558 {
2559 	ASSERT(pip != NULL);
2560 	if (pip) {
2561 		MDI_PI_LOCK(pip);
2562 	}
2563 }
2564 
2565 
2566 /*
2567  * mdi_pi_unlock():
2568  * 		Unlock the mdi_pathinfo node.
2569  * Note:
2570  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2571  */
2572 void
2573 mdi_pi_unlock(mdi_pathinfo_t *pip)
2574 {
2575 	ASSERT(pip != NULL);
2576 	if (pip) {
2577 		MDI_PI_UNLOCK(pip);
2578 	}
2579 }
2580 
2581 /*
2582  * mdi_pi_find():
2583  *		Search the list of mdi_pathinfo nodes attached to the
2584  *		pHCI/Client device node whose path address matches "paddr".
2585  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2586  *		found.
2587  * Return Values:
2588  *		mdi_pathinfo node handle
2589  *		NULL
2590  * Notes:
2591  *		Caller need not hold any locks to call this function.
2592  */
2593 mdi_pathinfo_t *
2594 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2595 {
2596 	mdi_phci_t		*ph;
2597 	mdi_vhci_t		*vh;
2598 	mdi_client_t		*ct;
2599 	mdi_pathinfo_t		*pip = NULL;
2600 
2601 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: %s %s",
2602 	    caddr ? caddr : "NULL", paddr ? paddr : "NULL"));
2603 	if ((pdip == NULL) || (paddr == NULL)) {
2604 		return (NULL);
2605 	}
2606 	ph = i_devi_get_phci(pdip);
2607 	if (ph == NULL) {
2608 		/*
2609 		 * Invalid pHCI device, Nothing more to do.
2610 		 */
2611 		MDI_DEBUG(2, (CE_WARN, pdip,
2612 		    "!mdi_pi_find: invalid phci"));
2613 		return (NULL);
2614 	}
2615 
2616 	vh = ph->ph_vhci;
2617 	if (vh == NULL) {
2618 		/*
2619 		 * Invalid vHCI device, Nothing more to do.
2620 		 */
2621 		MDI_DEBUG(2, (CE_WARN, pdip,
2622 		    "!mdi_pi_find: invalid vhci"));
2623 		return (NULL);
2624 	}
2625 
2626 	/*
2627 	 * Look for pathinfo node identified by paddr.
2628 	 */
2629 	if (caddr == NULL) {
2630 		/*
2631 		 * Find a mdi_pathinfo node under pHCI list for a matching
2632 		 * unit address.
2633 		 */
2634 		MDI_PHCI_LOCK(ph);
2635 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2636 			MDI_DEBUG(2, (CE_WARN, pdip,
2637 			    "!mdi_pi_find: offline phci %p", (void *)ph));
2638 			MDI_PHCI_UNLOCK(ph);
2639 			return (NULL);
2640 		}
2641 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2642 
2643 		while (pip != NULL) {
2644 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2645 				break;
2646 			}
2647 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2648 		}
2649 		MDI_PHCI_UNLOCK(ph);
2650 		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found %p",
2651 		    (void *)pip));
2652 		return (pip);
2653 	}
2654 
2655 	/*
2656 	 * XXX - Is the rest of the code in this function really necessary?
2657 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2658 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2659 	 * whether the search is based on the pathinfo nodes attached to
2660 	 * the pHCI or the client node, the result will be the same.
2661 	 */
2662 
2663 	/*
2664 	 * Find the client device corresponding to 'caddr'
2665 	 */
2666 	MDI_VHCI_CLIENT_LOCK(vh);
2667 
2668 	/*
2669 	 * XXX - Passing NULL to the following function works as long as the
2670 	 * the client addresses (caddr) are unique per vhci basis.
2671 	 */
2672 	ct = i_mdi_client_find(vh, NULL, caddr);
2673 	if (ct == NULL) {
2674 		/*
2675 		 * Client not found, Obviously mdi_pathinfo node has not been
2676 		 * created yet.
2677 		 */
2678 		MDI_VHCI_CLIENT_UNLOCK(vh);
2679 		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: client not "
2680 		    "found for caddr %s", caddr ? caddr : "NULL"));
2681 		return (NULL);
2682 	}
2683 
2684 	/*
2685 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2686 	 * pHCI and paddr
2687 	 */
2688 	MDI_CLIENT_LOCK(ct);
2689 
2690 	/*
2691 	 * Release the global mutex as it is no more needed. Note: We always
2692 	 * respect the locking order while acquiring.
2693 	 */
2694 	MDI_VHCI_CLIENT_UNLOCK(vh);
2695 
2696 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2697 	while (pip != NULL) {
2698 		/*
2699 		 * Compare the unit address
2700 		 */
2701 		if ((MDI_PI(pip)->pi_phci == ph) &&
2702 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2703 			break;
2704 		}
2705 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2706 	}
2707 	MDI_CLIENT_UNLOCK(ct);
2708 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found:: %p", (void *)pip));
2709 	return (pip);
2710 }
2711 
2712 /*
2713  * mdi_pi_alloc():
2714  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2715  *		The mdi_pathinfo node returned by this function identifies a
2716  *		unique device path is capable of having properties attached
2717  *		and passed to mdi_pi_online() to fully attach and online the
2718  *		path and client device node.
2719  *		The mdi_pathinfo node returned by this function must be
2720  *		destroyed using mdi_pi_free() if the path is no longer
2721  *		operational or if the caller fails to attach a client device
2722  *		node when calling mdi_pi_online(). The framework will not free
2723  *		the resources allocated.
2724  *		This function can be called from both interrupt and kernel
2725  *		contexts.  DDI_NOSLEEP flag should be used while calling
2726  *		from interrupt contexts.
2727  * Return Values:
2728  *		MDI_SUCCESS
2729  *		MDI_FAILURE
2730  *		MDI_NOMEM
2731  */
2732 /*ARGSUSED*/
2733 int
2734 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2735     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2736 {
2737 	mdi_vhci_t	*vh;
2738 	mdi_phci_t	*ph;
2739 	mdi_client_t	*ct;
2740 	mdi_pathinfo_t	*pip = NULL;
2741 	dev_info_t	*cdip;
2742 	int		rv = MDI_NOMEM;
2743 	int		path_allocated = 0;
2744 
2745 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_alloc_compatible: %s %s %s",
2746 	    cname ? cname : "NULL", caddr ? caddr : "NULL",
2747 	    paddr ? paddr : "NULL"));
2748 
2749 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2750 	    ret_pip == NULL) {
2751 		/* Nothing more to do */
2752 		return (MDI_FAILURE);
2753 	}
2754 
2755 	*ret_pip = NULL;
2756 
2757 	/* No allocations on detaching pHCI */
2758 	if (DEVI_IS_DETACHING(pdip)) {
2759 		/* Invalid pHCI device, return failure */
2760 		MDI_DEBUG(1, (CE_WARN, pdip,
2761 		    "!mdi_pi_alloc: detaching pHCI=%p", (void *)pdip));
2762 		return (MDI_FAILURE);
2763 	}
2764 
2765 	ph = i_devi_get_phci(pdip);
2766 	ASSERT(ph != NULL);
2767 	if (ph == NULL) {
2768 		/* Invalid pHCI device, return failure */
2769 		MDI_DEBUG(1, (CE_WARN, pdip,
2770 		    "!mdi_pi_alloc: invalid pHCI=%p", (void *)pdip));
2771 		return (MDI_FAILURE);
2772 	}
2773 
2774 	MDI_PHCI_LOCK(ph);
2775 	vh = ph->ph_vhci;
2776 	if (vh == NULL) {
2777 		/* Invalid vHCI device, return failure */
2778 		MDI_DEBUG(1, (CE_WARN, pdip,
2779 		    "!mdi_pi_alloc: invalid vHCI=%p", (void *)pdip));
2780 		MDI_PHCI_UNLOCK(ph);
2781 		return (MDI_FAILURE);
2782 	}
2783 
2784 	if (MDI_PHCI_IS_READY(ph) == 0) {
2785 		/*
2786 		 * Do not allow new node creation when pHCI is in
2787 		 * offline/suspended states
2788 		 */
2789 		MDI_DEBUG(1, (CE_WARN, pdip,
2790 		    "mdi_pi_alloc: pHCI=%p is not ready", (void *)ph));
2791 		MDI_PHCI_UNLOCK(ph);
2792 		return (MDI_BUSY);
2793 	}
2794 	MDI_PHCI_UNSTABLE(ph);
2795 	MDI_PHCI_UNLOCK(ph);
2796 
2797 	/* look for a matching client, create one if not found */
2798 	MDI_VHCI_CLIENT_LOCK(vh);
2799 	ct = i_mdi_client_find(vh, cname, caddr);
2800 	if (ct == NULL) {
2801 		ct = i_mdi_client_alloc(vh, cname, caddr);
2802 		ASSERT(ct != NULL);
2803 	}
2804 
2805 	if (ct->ct_dip == NULL) {
2806 		/*
2807 		 * Allocate a devinfo node
2808 		 */
2809 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2810 		    compatible, ncompatible);
2811 		if (ct->ct_dip == NULL) {
2812 			(void) i_mdi_client_free(vh, ct);
2813 			goto fail;
2814 		}
2815 	}
2816 	cdip = ct->ct_dip;
2817 
2818 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2819 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2820 
2821 	MDI_CLIENT_LOCK(ct);
2822 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2823 	while (pip != NULL) {
2824 		/*
2825 		 * Compare the unit address
2826 		 */
2827 		if ((MDI_PI(pip)->pi_phci == ph) &&
2828 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2829 			break;
2830 		}
2831 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2832 	}
2833 	MDI_CLIENT_UNLOCK(ct);
2834 
2835 	if (pip == NULL) {
2836 		/*
2837 		 * This is a new path for this client device.  Allocate and
2838 		 * initialize a new pathinfo node
2839 		 */
2840 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2841 		ASSERT(pip != NULL);
2842 		path_allocated = 1;
2843 	}
2844 	rv = MDI_SUCCESS;
2845 
2846 fail:
2847 	/*
2848 	 * Release the global mutex.
2849 	 */
2850 	MDI_VHCI_CLIENT_UNLOCK(vh);
2851 
2852 	/*
2853 	 * Mark the pHCI as stable
2854 	 */
2855 	MDI_PHCI_LOCK(ph);
2856 	MDI_PHCI_STABLE(ph);
2857 	MDI_PHCI_UNLOCK(ph);
2858 	*ret_pip = pip;
2859 
2860 	MDI_DEBUG(2, (CE_NOTE, pdip,
2861 	    "!mdi_pi_alloc_compatible: alloc %p", (void *)pip));
2862 
2863 	if (path_allocated)
2864 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2865 
2866 	return (rv);
2867 }
2868 
2869 /*ARGSUSED*/
2870 int
2871 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2872     int flags, mdi_pathinfo_t **ret_pip)
2873 {
2874 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2875 	    flags, ret_pip));
2876 }
2877 
2878 /*
2879  * i_mdi_pi_alloc():
2880  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2881  * Return Values:
2882  *		mdi_pathinfo
2883  */
2884 /*ARGSUSED*/
2885 static mdi_pathinfo_t *
2886 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2887 {
2888 	mdi_pathinfo_t	*pip;
2889 	int		ct_circular;
2890 	int		ph_circular;
2891 	static char	path[MAXPATHLEN];
2892 	char		*path_persistent;
2893 	int		path_instance;
2894 	mod_hash_val_t	hv;
2895 
2896 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2897 
2898 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2899 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2900 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2901 	    MDI_PATHINFO_STATE_TRANSIENT;
2902 
2903 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2904 		MDI_PI_SET_USER_DISABLE(pip);
2905 
2906 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2907 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2908 
2909 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2910 		MDI_PI_SET_DRV_DISABLE(pip);
2911 
2912 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2913 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2914 	MDI_PI(pip)->pi_client = ct;
2915 	MDI_PI(pip)->pi_phci = ph;
2916 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2917 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2918 
2919         /*
2920 	 * We form the "path" to the pathinfo node, and see if we have
2921 	 * already allocated a 'path_instance' for that "path".  If so,
2922 	 * we use the already allocated 'path_instance'.  If not, we
2923 	 * allocate a new 'path_instance' and associate it with a copy of
2924 	 * the "path" string (which is never freed). The association
2925 	 * between a 'path_instance' this "path" string persists until
2926 	 * reboot.
2927 	 */
2928         mutex_enter(&mdi_pathmap_mutex);
2929 	(void) ddi_pathname(ph->ph_dip, path);
2930 	(void) sprintf(path + strlen(path), "/%s@%s",
2931 	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2932         if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2933                 path_instance = (uint_t)(intptr_t)hv;
2934         } else {
2935 		/* allocate a new 'path_instance' and persistent "path" */
2936 		path_instance = mdi_pathmap_instance++;
2937 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2938                 (void) mod_hash_insert(mdi_pathmap_bypath,
2939                     (mod_hash_key_t)path_persistent,
2940                     (mod_hash_val_t)(intptr_t)path_instance);
2941 		(void) mod_hash_insert(mdi_pathmap_byinstance,
2942 		    (mod_hash_key_t)(intptr_t)path_instance,
2943 		    (mod_hash_val_t)path_persistent);
2944         }
2945         mutex_exit(&mdi_pathmap_mutex);
2946 	MDI_PI(pip)->pi_path_instance = path_instance;
2947 
2948 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
2949 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
2950 	MDI_PI(pip)->pi_pprivate = NULL;
2951 	MDI_PI(pip)->pi_cprivate = NULL;
2952 	MDI_PI(pip)->pi_vprivate = NULL;
2953 	MDI_PI(pip)->pi_client_link = NULL;
2954 	MDI_PI(pip)->pi_phci_link = NULL;
2955 	MDI_PI(pip)->pi_ref_cnt = 0;
2956 	MDI_PI(pip)->pi_kstats = NULL;
2957 	MDI_PI(pip)->pi_preferred = 1;
2958 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
2959 
2960 	/*
2961 	 * Lock both dev_info nodes against changes in parallel.
2962 	 *
2963 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
2964 	 * This atypical operation is done to synchronize pathinfo nodes
2965 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
2966 	 * the pathinfo nodes are children of the Client.
2967 	 */
2968 	ndi_devi_enter(ct->ct_dip, &ct_circular);
2969 	ndi_devi_enter(ph->ph_dip, &ph_circular);
2970 
2971 	i_mdi_phci_add_path(ph, pip);
2972 	i_mdi_client_add_path(ct, pip);
2973 
2974 	ndi_devi_exit(ph->ph_dip, ph_circular);
2975 	ndi_devi_exit(ct->ct_dip, ct_circular);
2976 
2977 	return (pip);
2978 }
2979 
2980 /*
2981  * mdi_pi_pathname_by_instance():
2982  *	Lookup of "path" by 'path_instance'. Return "path".
2983  *	NOTE: returned "path" remains valid forever (until reboot).
2984  */
2985 char *
2986 mdi_pi_pathname_by_instance(int path_instance)
2987 {
2988 	char		*path;
2989 	mod_hash_val_t	hv;
2990 
2991 	/* mdi_pathmap lookup of "path" by 'path_instance' */
2992 	mutex_enter(&mdi_pathmap_mutex);
2993 	if (mod_hash_find(mdi_pathmap_byinstance,
2994 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
2995 		path = (char *)hv;
2996 	else
2997 		path = NULL;
2998 	mutex_exit(&mdi_pathmap_mutex);
2999 	return (path);
3000 }
3001 
3002 /*
3003  * i_mdi_phci_add_path():
3004  * 		Add a mdi_pathinfo node to pHCI list.
3005  * Notes:
3006  *		Caller should per-pHCI mutex
3007  */
3008 static void
3009 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3010 {
3011 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3012 
3013 	MDI_PHCI_LOCK(ph);
3014 	if (ph->ph_path_head == NULL) {
3015 		ph->ph_path_head = pip;
3016 	} else {
3017 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
3018 	}
3019 	ph->ph_path_tail = pip;
3020 	ph->ph_path_count++;
3021 	MDI_PHCI_UNLOCK(ph);
3022 }
3023 
3024 /*
3025  * i_mdi_client_add_path():
3026  *		Add mdi_pathinfo node to client list
3027  */
3028 static void
3029 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3030 {
3031 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3032 
3033 	MDI_CLIENT_LOCK(ct);
3034 	if (ct->ct_path_head == NULL) {
3035 		ct->ct_path_head = pip;
3036 	} else {
3037 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3038 	}
3039 	ct->ct_path_tail = pip;
3040 	ct->ct_path_count++;
3041 	MDI_CLIENT_UNLOCK(ct);
3042 }
3043 
3044 /*
3045  * mdi_pi_free():
3046  *		Free the mdi_pathinfo node and also client device node if this
3047  *		is the last path to the device
3048  * Return Values:
3049  *		MDI_SUCCESS
3050  *		MDI_FAILURE
3051  *		MDI_BUSY
3052  */
3053 /*ARGSUSED*/
3054 int
3055 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3056 {
3057 	int		rv = MDI_FAILURE;
3058 	mdi_vhci_t	*vh;
3059 	mdi_phci_t	*ph;
3060 	mdi_client_t	*ct;
3061 	int		(*f)();
3062 	int		client_held = 0;
3063 
3064 	MDI_PI_LOCK(pip);
3065 	ph = MDI_PI(pip)->pi_phci;
3066 	ASSERT(ph != NULL);
3067 	if (ph == NULL) {
3068 		/*
3069 		 * Invalid pHCI device, return failure
3070 		 */
3071 		MDI_DEBUG(1, (CE_WARN, NULL,
3072 		    "!mdi_pi_free: invalid pHCI pip=%p", (void *)pip));
3073 		MDI_PI_UNLOCK(pip);
3074 		return (MDI_FAILURE);
3075 	}
3076 
3077 	vh = ph->ph_vhci;
3078 	ASSERT(vh != NULL);
3079 	if (vh == NULL) {
3080 		/* Invalid pHCI device, return failure */
3081 		MDI_DEBUG(1, (CE_WARN, NULL,
3082 		    "!mdi_pi_free: invalid vHCI pip=%p", (void *)pip));
3083 		MDI_PI_UNLOCK(pip);
3084 		return (MDI_FAILURE);
3085 	}
3086 
3087 	ct = MDI_PI(pip)->pi_client;
3088 	ASSERT(ct != NULL);
3089 	if (ct == NULL) {
3090 		/*
3091 		 * Invalid Client device, return failure
3092 		 */
3093 		MDI_DEBUG(1, (CE_WARN, NULL,
3094 		    "!mdi_pi_free: invalid client pip=%p", (void *)pip));
3095 		MDI_PI_UNLOCK(pip);
3096 		return (MDI_FAILURE);
3097 	}
3098 
3099 	/*
3100 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3101 	 * if the node state is either offline or init and the reference count
3102 	 * is zero.
3103 	 */
3104 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3105 	    MDI_PI_IS_INITING(pip))) {
3106 		/*
3107 		 * Node is busy
3108 		 */
3109 		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3110 		    "!mdi_pi_free: pathinfo node is busy pip=%p", (void *)pip));
3111 		MDI_PI_UNLOCK(pip);
3112 		return (MDI_BUSY);
3113 	}
3114 
3115 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3116 		/*
3117 		 * Give a chance for pending I/Os to complete.
3118 		 */
3119 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!mdi_pi_free: "
3120 		    "%d cmds still pending on path: %p\n",
3121 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3122 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3123 		    &MDI_PI(pip)->pi_mutex,
3124 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3125 			/*
3126 			 * The timeout time reached without ref_cnt being zero
3127 			 * being signaled.
3128 			 */
3129 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
3130 			    "!mdi_pi_free: "
3131 			    "Timeout reached on path %p without the cond\n",
3132 			    (void *)pip));
3133 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
3134 			    "!mdi_pi_free: "
3135 			    "%d cmds still pending on path: %p\n",
3136 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3137 			MDI_PI_UNLOCK(pip);
3138 			return (MDI_BUSY);
3139 		}
3140 	}
3141 	if (MDI_PI(pip)->pi_pm_held) {
3142 		client_held = 1;
3143 	}
3144 	MDI_PI_UNLOCK(pip);
3145 
3146 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3147 
3148 	MDI_CLIENT_LOCK(ct);
3149 
3150 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3151 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3152 
3153 	/*
3154 	 * Wait till failover is complete before removing this node.
3155 	 */
3156 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3157 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3158 
3159 	MDI_CLIENT_UNLOCK(ct);
3160 	MDI_VHCI_CLIENT_LOCK(vh);
3161 	MDI_CLIENT_LOCK(ct);
3162 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3163 
3164 	if (!MDI_PI_IS_INITING(pip)) {
3165 		f = vh->vh_ops->vo_pi_uninit;
3166 		if (f != NULL) {
3167 			rv = (*f)(vh->vh_dip, pip, 0);
3168 		}
3169 	}
3170 	/*
3171 	 * If vo_pi_uninit() completed successfully.
3172 	 */
3173 	if (rv == MDI_SUCCESS) {
3174 		if (client_held) {
3175 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_free "
3176 			    "i_mdi_pm_rele_client\n"));
3177 			i_mdi_pm_rele_client(ct, 1);
3178 		}
3179 		i_mdi_pi_free(ph, pip, ct);
3180 		if (ct->ct_path_count == 0) {
3181 			/*
3182 			 * Client lost its last path.
3183 			 * Clean up the client device
3184 			 */
3185 			MDI_CLIENT_UNLOCK(ct);
3186 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3187 			MDI_VHCI_CLIENT_UNLOCK(vh);
3188 			return (rv);
3189 		}
3190 	}
3191 	MDI_CLIENT_UNLOCK(ct);
3192 	MDI_VHCI_CLIENT_UNLOCK(vh);
3193 
3194 	if (rv == MDI_FAILURE)
3195 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3196 
3197 	return (rv);
3198 }
3199 
3200 /*
3201  * i_mdi_pi_free():
3202  *		Free the mdi_pathinfo node
3203  */
3204 static void
3205 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3206 {
3207 	int	ct_circular;
3208 	int	ph_circular;
3209 
3210 	ASSERT(MDI_CLIENT_LOCKED(ct));
3211 
3212 	/*
3213 	 * remove any per-path kstats
3214 	 */
3215 	i_mdi_pi_kstat_destroy(pip);
3216 
3217 	/* See comments in i_mdi_pi_alloc() */
3218 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3219 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3220 
3221 	i_mdi_client_remove_path(ct, pip);
3222 	i_mdi_phci_remove_path(ph, pip);
3223 
3224 	ndi_devi_exit(ph->ph_dip, ph_circular);
3225 	ndi_devi_exit(ct->ct_dip, ct_circular);
3226 
3227 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3228 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3229 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3230 	if (MDI_PI(pip)->pi_addr) {
3231 		kmem_free(MDI_PI(pip)->pi_addr,
3232 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3233 		MDI_PI(pip)->pi_addr = NULL;
3234 	}
3235 
3236 	if (MDI_PI(pip)->pi_prop) {
3237 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3238 		MDI_PI(pip)->pi_prop = NULL;
3239 	}
3240 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3241 }
3242 
3243 
3244 /*
3245  * i_mdi_phci_remove_path():
3246  * 		Remove a mdi_pathinfo node from pHCI list.
3247  * Notes:
3248  *		Caller should hold per-pHCI mutex
3249  */
3250 static void
3251 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3252 {
3253 	mdi_pathinfo_t	*prev = NULL;
3254 	mdi_pathinfo_t	*path = NULL;
3255 
3256 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3257 
3258 	MDI_PHCI_LOCK(ph);
3259 	path = ph->ph_path_head;
3260 	while (path != NULL) {
3261 		if (path == pip) {
3262 			break;
3263 		}
3264 		prev = path;
3265 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3266 	}
3267 
3268 	if (path) {
3269 		ph->ph_path_count--;
3270 		if (prev) {
3271 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3272 		} else {
3273 			ph->ph_path_head =
3274 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3275 		}
3276 		if (ph->ph_path_tail == path) {
3277 			ph->ph_path_tail = prev;
3278 		}
3279 	}
3280 
3281 	/*
3282 	 * Clear the pHCI link
3283 	 */
3284 	MDI_PI(pip)->pi_phci_link = NULL;
3285 	MDI_PI(pip)->pi_phci = NULL;
3286 	MDI_PHCI_UNLOCK(ph);
3287 }
3288 
3289 /*
3290  * i_mdi_client_remove_path():
3291  * 		Remove a mdi_pathinfo node from client path list.
3292  */
3293 static void
3294 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3295 {
3296 	mdi_pathinfo_t	*prev = NULL;
3297 	mdi_pathinfo_t	*path;
3298 
3299 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3300 
3301 	ASSERT(MDI_CLIENT_LOCKED(ct));
3302 	path = ct->ct_path_head;
3303 	while (path != NULL) {
3304 		if (path == pip) {
3305 			break;
3306 		}
3307 		prev = path;
3308 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3309 	}
3310 
3311 	if (path) {
3312 		ct->ct_path_count--;
3313 		if (prev) {
3314 			MDI_PI(prev)->pi_client_link =
3315 			    MDI_PI(path)->pi_client_link;
3316 		} else {
3317 			ct->ct_path_head =
3318 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3319 		}
3320 		if (ct->ct_path_tail == path) {
3321 			ct->ct_path_tail = prev;
3322 		}
3323 		if (ct->ct_path_last == path) {
3324 			ct->ct_path_last = ct->ct_path_head;
3325 		}
3326 	}
3327 	MDI_PI(pip)->pi_client_link = NULL;
3328 	MDI_PI(pip)->pi_client = NULL;
3329 }
3330 
/*
 * i_mdi_pi_state_change():
 *		Common worker that moves a mdi_pathinfo node into the
 *		requested state (online, standby, fault or offline).
 *
 *		The sequence is:
 *		  1. validate the pHCI, vHCI and client behind the path;
 *		  2. if the path is still initializing, call the vHCI
 *		     vo_pi_init() entry point;
 *		  3. refuse the change if the pHCI is not ready, otherwise
 *		     mark the pHCI unstable;
 *		  4. wait out any transient state and any failover in
 *		     progress, then mark the path transient for the new
 *		     state;
 *		  5. call the vHCI vo_pi_state_change() entry point;
 *		  6. on success, update the client state and online or
 *		     offline the client devinfo node to match;
 *		  7. mark the pHCI stable again.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY	the pHCI is not ready, or an ndi_devi_online()/
 *				ndi_devi_offline() call returned NDI_BUSY
 *		Any other value returned by vo_pi_state_change() is passed
 *		back unchanged.
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid phci pip=%p", (void *)pip));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid vhci pip=%p", (void *)pip));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid client pip=%p",
		    (void *)pip));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, call back into the vHCI
	 * driver's pathinfo node initialize entry point.  The pathinfo mutex
	 * is dropped around the callback and re-taken afterwards; a failed
	 * callback returns MDI_FAILURE with no locks held.
	 */

	if (MDI_PI_IS_INITING(pip)) {
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
				    "!vo_pi_init: failed vHCI=0x%p, pip=0x%p",
				    (void *)vh, (void *)pip));
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states.  If the pHCI is ready it is marked unstable here and
	 * marked stable again at state_change_exit.
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
		    "!mdi_pi_state_change: pHCI not ready, pHCI=%p",
		    (void *)ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient
	 */
	MDI_PI_LOCK(pip);
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);
		/*
		 * Don't offline the client dev_info node unless we have
		 * no available paths left at all.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_DEVI_REMOVE) &&
		    (ct->ct_path_count == 1)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, 0);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code.  Neither the
				 * pathinfo nor the client lock is held when
				 * jumping to state_change_exit.
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/*
	 * Call the vHCI state change entry point with no MDI locks held.
	 */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL)
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);

	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (rv == MDI_NOT_SUPPORTED) {
		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
	}
	if (rv != MDI_SUCCESS) {
		MDI_DEBUG(2, (CE_WARN, ct->ct_dip,
		    "!vo_pi_state_change: failed rv = %x", rv));
	}
	/*
	 * Leave the transient state: commit the new state on success,
	 * otherwise fall back to the previous state.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Changing the mdi_pathinfo node state will impact
			 * the client state.  Update the client and dev_info
			 * node state accordingly.  From here until the
			 * conversion switch below, rv holds an NDI_* code.
			 */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip && !i_ddi_devi_attached(cdip) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_online(cdip, 0);
					MDI_CLIENT_LOCK(ct);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						/*
						 * ndi_devi_online failed.
						 * Reset client flags to
						 * offline.
						 */
						MDI_DEBUG(1, (CE_WARN, cdip,
						    "!ndi_devi_online: failed "
						    " Error: %x", rv));
						MDI_CLIENT_SET_OFFLINE(ct);
					}
					if (rv != NDI_SUCCESS) {
						/* Reset the path state */
						MDI_PI_LOCK(pip);
						MDI_PI(pip)->pi_state =
						    MDI_PI_OLD_STATE(pip);
						MDI_PI_UNLOCK(pip);
					}
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_DEVI_REMOVE) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip, 0);
					MDI_CLIENT_LOCK(ct);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (CE_WARN, cdip,
						    "!ndi_devi_offline: failed "
						    " Error: %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code.  The path state is only
			 * reported when the client node change succeeded.
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
3650 
3651 /*
3652  * mdi_pi_online():
3653  *		Place the path_info node in the online state.  The path is
3654  *		now available to be selected by mdi_select_path() for
3655  *		transporting I/O requests to client devices.
3656  * Return Values:
3657  *		MDI_SUCCESS
3658  *		MDI_FAILURE
3659  */
3660 int
3661 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3662 {
3663 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3664 	int		client_held = 0;
3665 	int		rv;
3666 	int		se_flag;
3667 	int		kmem_flag;
3668 
3669 	ASSERT(ct != NULL);
3670 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3671 	if (rv != MDI_SUCCESS)
3672 		return (rv);
3673 
3674 	MDI_PI_LOCK(pip);
3675 	if (MDI_PI(pip)->pi_pm_held == 0) {
3676 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3677 		    "i_mdi_pm_hold_pip %p\n", (void *)pip));
3678 		i_mdi_pm_hold_pip(pip);
3679 		client_held = 1;
3680 	}
3681 	MDI_PI_UNLOCK(pip);
3682 
3683 	if (client_held) {
3684 		MDI_CLIENT_LOCK(ct);
3685 		if (ct->ct_power_cnt == 0) {
3686 			rv = i_mdi_power_all_phci(ct);
3687 		}
3688 
3689 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3690 		    "i_mdi_pm_hold_client %p\n", (void *)ct));
3691 		i_mdi_pm_hold_client(ct, 1);
3692 		MDI_CLIENT_UNLOCK(ct);
3693 	}
3694 
3695 	/* determine interrupt context */
3696 	se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
3697 	kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
3698 
3699 	/* A new path is online.  Invalidate DINFOCACHE snap shot. */
3700 	i_ddi_di_cache_invalidate(kmem_flag);
3701 
3702 	return (rv);
3703 }
3704 
3705 /*
3706  * mdi_pi_standby():
3707  *		Place the mdi_pathinfo node in standby state
3708  *
3709  * Return Values:
3710  *		MDI_SUCCESS
3711  *		MDI_FAILURE
3712  */
3713 int
3714 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3715 {
3716 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3717 }
3718 
3719 /*
3720  * mdi_pi_fault():
3721  *		Place the mdi_pathinfo node in fault'ed state
3722  * Return Values:
3723  *		MDI_SUCCESS
3724  *		MDI_FAILURE
3725  */
3726 int
3727 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3728 {
3729 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3730 }
3731 
3732 /*
3733  * mdi_pi_offline():
3734  *		Offline a mdi_pathinfo node.
3735  * Return Values:
3736  *		MDI_SUCCESS
3737  *		MDI_FAILURE
3738  */
3739 int
3740 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3741 {
3742 	int	ret, client_held = 0;
3743 	mdi_client_t	*ct;
3744 	int		se_flag;
3745 	int		kmem_flag;
3746 
3747 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3748 
3749 	if (ret == MDI_SUCCESS) {
3750 		MDI_PI_LOCK(pip);
3751 		if (MDI_PI(pip)->pi_pm_held) {
3752 			client_held = 1;
3753 		}
3754 		MDI_PI_UNLOCK(pip);
3755 
3756 		if (client_held) {
3757 			ct = MDI_PI(pip)->pi_client;
3758 			MDI_CLIENT_LOCK(ct);
3759 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip,
3760 			    "mdi_pi_offline i_mdi_pm_rele_client\n"));
3761 			i_mdi_pm_rele_client(ct, 1);
3762 			MDI_CLIENT_UNLOCK(ct);
3763 		}
3764 
3765 		/* determine interrupt context */
3766 		se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
3767 		kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
3768 
3769 		/* pathinfo is offlined. update DINFOCACHE. */
3770 		i_ddi_di_cache_invalidate(kmem_flag);
3771 	}
3772 
3773 	return (ret);
3774 }
3775 
3776 /*
3777  * i_mdi_pi_offline():
3778  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3779  */
3780 static int
3781 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3782 {
3783 	dev_info_t	*vdip = NULL;
3784 	mdi_vhci_t	*vh = NULL;
3785 	mdi_client_t	*ct = NULL;
3786 	int		(*f)();
3787 	int		rv;
3788 
3789 	MDI_PI_LOCK(pip);
3790 	ct = MDI_PI(pip)->pi_client;
3791 	ASSERT(ct != NULL);
3792 
3793 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3794 		/*
3795 		 * Give a chance for pending I/Os to complete.
3796 		 */
3797 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3798 		    "%d cmds still pending on path: %p\n",
3799 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3800 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3801 		    &MDI_PI(pip)->pi_mutex,
3802 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3803 			/*
3804 			 * The timeout time reached without ref_cnt being zero
3805 			 * being signaled.
3806 			 */
3807 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3808 			    "Timeout reached on path %p without the cond\n",
3809 			    (void *)pip));
3810 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3811 			    "%d cmds still pending on path: %p\n",
3812 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3813 		}
3814 	}
3815 	vh = ct->ct_vhci;
3816 	vdip = vh->vh_dip;
3817 
3818 	/*
3819 	 * Notify vHCI that has registered this event
3820 	 */
3821 	ASSERT(vh->vh_ops);
3822 	f = vh->vh_ops->vo_pi_state_change;
3823 
3824 	if (f != NULL) {
3825 		MDI_PI_UNLOCK(pip);
3826 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3827 		    flags)) != MDI_SUCCESS) {
3828 			MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3829 			    "!vo_path_offline failed "
3830 			    "vdip %p, pip %p", (void *)vdip, (void *)pip));
3831 		}
3832 		MDI_PI_LOCK(pip);
3833 	}
3834 
3835 	/*
3836 	 * Set the mdi_pathinfo node state and clear the transient condition
3837 	 */
3838 	MDI_PI_SET_OFFLINE(pip);
3839 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3840 	MDI_PI_UNLOCK(pip);
3841 
3842 	MDI_CLIENT_LOCK(ct);
3843 	if (rv == MDI_SUCCESS) {
3844 		if (ct->ct_unstable == 0) {
3845 			dev_info_t	*cdip = ct->ct_dip;
3846 
3847 			/*
3848 			 * Onlining the mdi_pathinfo node will impact the
3849 			 * client state Update the client and dev_info node
3850 			 * state accordingly
3851 			 */
3852 			i_mdi_client_update_state(ct);
3853 			rv = NDI_SUCCESS;
3854 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3855 				if (cdip &&
3856 				    (i_ddi_node_state(cdip) >=
3857 				    DS_INITIALIZED)) {
3858 					MDI_CLIENT_UNLOCK(ct);
3859 					rv = ndi_devi_offline(cdip, 0);
3860 					MDI_CLIENT_LOCK(ct);
3861 					if (rv != NDI_SUCCESS) {
3862 						/*
3863 						 * ndi_devi_offline failed.
3864 						 * Reset client flags to
3865 						 * online.
3866 						 */
3867 						MDI_DEBUG(4, (CE_WARN, cdip,
3868 						    "!ndi_devi_offline: failed "
3869 						    " Error: %x", rv));
3870 						MDI_CLIENT_SET_ONLINE(ct);
3871 					}
3872 				}
3873 			}
3874 			/*
3875 			 * Convert to MDI error code
3876 			 */
3877 			switch (rv) {
3878 			case NDI_SUCCESS:
3879 				rv = MDI_SUCCESS;
3880 				break;
3881 			case NDI_BUSY:
3882 				rv = MDI_BUSY;
3883 				break;
3884 			default:
3885 				rv = MDI_FAILURE;
3886 				break;
3887 			}
3888 		}
3889 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3890 		i_mdi_report_path_state(ct, pip);
3891 	}
3892 
3893 	MDI_CLIENT_UNLOCK(ct);
3894 
3895 	/*
3896 	 * Change in the mdi_pathinfo node state will impact the client state
3897 	 */
3898 	MDI_DEBUG(2, (CE_NOTE, NULL, "!i_mdi_pi_offline ct = %p pip = %p",
3899 	    (void *)ct, (void *)pip));
3900 	return (rv);
3901 }
3902 
3903 /*
3904  * mdi_pi_get_node_name():
3905  *              Get the name associated with a mdi_pathinfo node.
3906  *              Since pathinfo nodes are not directly named, we
3907  *              return the node_name of the client.
3908  *
3909  * Return Values:
3910  *              char *
3911  */
3912 char *
3913 mdi_pi_get_node_name(mdi_pathinfo_t *pip)
3914 {
3915 	mdi_client_t    *ct;
3916 
3917 	if (pip == NULL)
3918 		return (NULL);
3919 	ct = MDI_PI(pip)->pi_client;
3920 	if ((ct == NULL) || (ct->ct_dip == NULL))
3921 		return (NULL);
3922 	return (ddi_node_name(ct->ct_dip));
3923 }
3924 
3925 /*
3926  * mdi_pi_get_addr():
3927  *		Get the unit address associated with a mdi_pathinfo node
3928  *
3929  * Return Values:
3930  *		char *
3931  */
3932 char *
3933 mdi_pi_get_addr(mdi_pathinfo_t *pip)
3934 {
3935 	if (pip == NULL)
3936 		return (NULL);
3937 
3938 	return (MDI_PI(pip)->pi_addr);
3939 }
3940 
3941 /*
3942  * mdi_pi_get_path_instance():
3943  *		Get the 'path_instance' of a mdi_pathinfo node
3944  *
3945  * Return Values:
3946  *		path_instance
3947  */
3948 int
3949 mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
3950 {
3951 	if (pip == NULL)
3952 		return (0);
3953 
3954 	return (MDI_PI(pip)->pi_path_instance);
3955 }
3956 
3957 /*
3958  * mdi_pi_pathname():
3959  *		Return pointer to path to pathinfo node.
3960  */
3961 char *
3962 mdi_pi_pathname(mdi_pathinfo_t *pip)
3963 {
3964 	if (pip == NULL)
3965 		return (NULL);
3966 	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
3967 }
3968 
3969 char *
3970 mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
3971 {
3972 	char *obp_path = NULL;
3973 	if ((pip == NULL) || (path == NULL))
3974 		return (NULL);
3975 
3976 	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
3977 		(void) strcpy(path, obp_path);
3978 		(void) mdi_prop_free(obp_path);
3979 	} else {
3980 		path = NULL;
3981 	}
3982 	return (path);
3983 }
3984 
3985 int
3986 mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
3987 {
3988 	dev_info_t *pdip;
3989 	char *obp_path = NULL;
3990 	int rc = MDI_FAILURE;
3991 
3992 	if (pip == NULL)
3993 		return (MDI_FAILURE);
3994 
3995 	pdip = mdi_pi_get_phci(pip);
3996 	if (pdip == NULL)
3997 		return (MDI_FAILURE);
3998 
3999 	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4000 
4001 	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
4002 		(void) ddi_pathname(pdip, obp_path);
4003 	}
4004 
4005 	if (component) {
4006 		(void) strncat(obp_path, "/", MAXPATHLEN);
4007 		(void) strncat(obp_path, component, MAXPATHLEN);
4008 	}
4009 	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
4010 
4011 	if (obp_path)
4012 		kmem_free(obp_path, MAXPATHLEN);
4013 	return (rc);
4014 }
4015 
4016 /*
4017  * mdi_pi_get_client():
4018  *		Get the client devinfo associated with a mdi_pathinfo node
4019  *
4020  * Return Values:
4021  *		Handle to client device dev_info node
4022  */
4023 dev_info_t *
4024 mdi_pi_get_client(mdi_pathinfo_t *pip)
4025 {
4026 	dev_info_t	*dip = NULL;
4027 	if (pip) {
4028 		dip = MDI_PI(pip)->pi_client->ct_dip;
4029 	}
4030 	return (dip);
4031 }
4032 
4033 /*
4034  * mdi_pi_get_phci():
4035  *		Get the pHCI devinfo associated with the mdi_pathinfo node
4036  * Return Values:
4037  *		Handle to dev_info node
4038  */
4039 dev_info_t *
4040 mdi_pi_get_phci(mdi_pathinfo_t *pip)
4041 {
4042 	dev_info_t	*dip = NULL;
4043 	if (pip) {
4044 		dip = MDI_PI(pip)->pi_phci->ph_dip;
4045 	}
4046 	return (dip);
4047 }
4048 
4049 /*
4050  * mdi_pi_get_client_private():
4051  *		Get the client private information associated with the
4052  *		mdi_pathinfo node
4053  */
4054 void *
4055 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
4056 {
4057 	void *cprivate = NULL;
4058 	if (pip) {
4059 		cprivate = MDI_PI(pip)->pi_cprivate;
4060 	}
4061 	return (cprivate);
4062 }
4063 
4064 /*
4065  * mdi_pi_set_client_private():
4066  *		Set the client private information in the mdi_pathinfo node
4067  */
4068 void
4069 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
4070 {
4071 	if (pip) {
4072 		MDI_PI(pip)->pi_cprivate = priv;
4073 	}
4074 }
4075 
4076 /*
4077  * mdi_pi_get_phci_private():
4078  *		Get the pHCI private information associated with the
4079  *		mdi_pathinfo node
4080  */
4081 caddr_t
4082 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
4083 {
4084 	caddr_t	pprivate = NULL;
4085 	if (pip) {
4086 		pprivate = MDI_PI(pip)->pi_pprivate;
4087 	}
4088 	return (pprivate);
4089 }
4090 
4091 /*
4092  * mdi_pi_set_phci_private():
4093  *		Set the pHCI private information in the mdi_pathinfo node
4094  */
4095 void
4096 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4097 {
4098 	if (pip) {
4099 		MDI_PI(pip)->pi_pprivate = priv;
4100 	}
4101 }
4102 
4103 /*
4104  * mdi_pi_get_state():
4105  *		Get the mdi_pathinfo node state. Transient states are internal
4106  *		and not provided to the users
4107  */
4108 mdi_pathinfo_state_t
4109 mdi_pi_get_state(mdi_pathinfo_t *pip)
4110 {
4111 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4112 
4113 	if (pip) {
4114 		if (MDI_PI_IS_TRANSIENT(pip)) {
4115 			/*
4116 			 * mdi_pathinfo is in state transition.  Return the
4117 			 * last good state.
4118 			 */
4119 			state = MDI_PI_OLD_STATE(pip);
4120 		} else {
4121 			state = MDI_PI_STATE(pip);
4122 		}
4123 	}
4124 	return (state);
4125 }
4126 
4127 /*
4128  * Note that the following function needs to be the new interface for
4129  * mdi_pi_get_state when mpxio gets integrated to ON.
4130  */
4131 int
4132 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4133 		uint32_t *ext_state)
4134 {
4135 	*state = MDI_PATHINFO_STATE_INIT;
4136 
4137 	if (pip) {
4138 		if (MDI_PI_IS_TRANSIENT(pip)) {
4139 			/*
4140 			 * mdi_pathinfo is in state transition.  Return the
4141 			 * last good state.
4142 			 */
4143 			*state = MDI_PI_OLD_STATE(pip);
4144 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4145 		} else {
4146 			*state = MDI_PI_STATE(pip);
4147 			*ext_state = MDI_PI_EXT_STATE(pip);
4148 		}
4149 	}
4150 	return (MDI_SUCCESS);
4151 }
4152 
4153 /*
4154  * mdi_pi_get_preferred:
4155  *	Get the preferred path flag
4156  */
4157 int
4158 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4159 {
4160 	if (pip) {
4161 		return (MDI_PI(pip)->pi_preferred);
4162 	}
4163 	return (0);
4164 }
4165 
4166 /*
4167  * mdi_pi_set_preferred:
4168  *	Set the preferred path flag
4169  */
4170 void
4171 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4172 {
4173 	if (pip) {
4174 		MDI_PI(pip)->pi_preferred = preferred;
4175 	}
4176 }
4177 
4178 /*
4179  * mdi_pi_set_state():
4180  *		Set the mdi_pathinfo node state
4181  */
4182 void
4183 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4184 {
4185 	uint32_t	ext_state;
4186 
4187 	if (pip) {
4188 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4189 		MDI_PI(pip)->pi_state = state;
4190 		MDI_PI(pip)->pi_state |= ext_state;
4191 	}
4192 }
4193 
4194 /*
4195  * Property functions:
4196  */
4197 int
4198 i_map_nvlist_error_to_mdi(int val)
4199 {
4200 	int rv;
4201 
4202 	switch (val) {
4203 	case 0:
4204 		rv = DDI_PROP_SUCCESS;
4205 		break;
4206 	case EINVAL:
4207 	case ENOTSUP:
4208 		rv = DDI_PROP_INVAL_ARG;
4209 		break;
4210 	case ENOMEM:
4211 		rv = DDI_PROP_NO_MEMORY;
4212 		break;
4213 	default:
4214 		rv = DDI_PROP_NOT_FOUND;
4215 		break;
4216 	}
4217 	return (rv);
4218 }
4219 
4220 /*
4221  * mdi_pi_get_next_prop():
4222  * 		Property walk function.  The caller should hold mdi_pi_lock()
4223  *		and release by calling mdi_pi_unlock() at the end of walk to
4224  *		get a consistent value.
4225  */
4226 nvpair_t *
4227 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4228 {
4229 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4230 		return (NULL);
4231 	}
4232 	ASSERT(MDI_PI_LOCKED(pip));
4233 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4234 }
4235 
4236 /*
4237  * mdi_prop_remove():
4238  * 		Remove the named property from the named list.
4239  */
4240 int
4241 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4242 {
4243 	if (pip == NULL) {
4244 		return (DDI_PROP_NOT_FOUND);
4245 	}
4246 	ASSERT(!MDI_PI_LOCKED(pip));
4247 	MDI_PI_LOCK(pip);
4248 	if (MDI_PI(pip)->pi_prop == NULL) {
4249 		MDI_PI_UNLOCK(pip);
4250 		return (DDI_PROP_NOT_FOUND);
4251 	}
4252 	if (name) {
4253 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4254 	} else {
4255 		char		nvp_name[MAXNAMELEN];
4256 		nvpair_t	*nvp;
4257 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4258 		while (nvp) {
4259 			nvpair_t	*next;
4260 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4261 			(void) snprintf(nvp_name, MAXNAMELEN, "%s",
4262 			    nvpair_name(nvp));
4263 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4264 			    nvp_name);
4265 			nvp = next;
4266 		}
4267 	}
4268 	MDI_PI_UNLOCK(pip);
4269 	return (DDI_PROP_SUCCESS);
4270 }
4271 
4272 /*
4273  * mdi_prop_size():
4274  * 		Get buffer size needed to pack the property data.
4275  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4276  *		buffer size.
4277  */
4278 int
4279 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4280 {
4281 	int	rv;
4282 	size_t	bufsize;
4283 
4284 	*buflenp = 0;
4285 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4286 		return (DDI_PROP_NOT_FOUND);
4287 	}
4288 	ASSERT(MDI_PI_LOCKED(pip));
4289 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4290 	    &bufsize, NV_ENCODE_NATIVE);
4291 	*buflenp = bufsize;
4292 	return (i_map_nvlist_error_to_mdi(rv));
4293 }
4294 
4295 /*
4296  * mdi_prop_pack():
4297  * 		pack the property list.  The caller should hold the
4298  *		mdi_pathinfo_t node to get a consistent data
4299  */
4300 int
4301 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4302 {
4303 	int	rv;
4304 	size_t	bufsize;
4305 
4306 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4307 		return (DDI_PROP_NOT_FOUND);
4308 	}
4309 
4310 	ASSERT(MDI_PI_LOCKED(pip));
4311 
4312 	bufsize = buflen;
4313 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4314 	    NV_ENCODE_NATIVE, KM_SLEEP);
4315 
4316 	return (i_map_nvlist_error_to_mdi(rv));
4317 }
4318 
4319 /*
4320  * mdi_prop_update_byte():
4321  *		Create/Update a byte property
4322  */
4323 int
4324 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4325 {
4326 	int rv;
4327 
4328 	if (pip == NULL) {
4329 		return (DDI_PROP_INVAL_ARG);
4330 	}
4331 	ASSERT(!MDI_PI_LOCKED(pip));
4332 	MDI_PI_LOCK(pip);
4333 	if (MDI_PI(pip)->pi_prop == NULL) {
4334 		MDI_PI_UNLOCK(pip);
4335 		return (DDI_PROP_NOT_FOUND);
4336 	}
4337 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4338 	MDI_PI_UNLOCK(pip);
4339 	return (i_map_nvlist_error_to_mdi(rv));
4340 }
4341 
4342 /*
4343  * mdi_prop_update_byte_array():
4344  *		Create/Update a byte array property
4345  */
4346 int
4347 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4348     uint_t nelements)
4349 {
4350 	int rv;
4351 
4352 	if (pip == NULL) {
4353 		return (DDI_PROP_INVAL_ARG);
4354 	}
4355 	ASSERT(!MDI_PI_LOCKED(pip));
4356 	MDI_PI_LOCK(pip);
4357 	if (MDI_PI(pip)->pi_prop == NULL) {
4358 		MDI_PI_UNLOCK(pip);
4359 		return (DDI_PROP_NOT_FOUND);
4360 	}
4361 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4362 	MDI_PI_UNLOCK(pip);
4363 	return (i_map_nvlist_error_to_mdi(rv));
4364 }
4365 
4366 /*
4367  * mdi_prop_update_int():
4368  *		Create/Update a 32 bit integer property
4369  */
4370 int
4371 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4372 {
4373 	int rv;
4374 
4375 	if (pip == NULL) {
4376 		return (DDI_PROP_INVAL_ARG);
4377 	}
4378 	ASSERT(!MDI_PI_LOCKED(pip));
4379 	MDI_PI_LOCK(pip);
4380 	if (MDI_PI(pip)->pi_prop == NULL) {
4381 		MDI_PI_UNLOCK(pip);
4382 		return (DDI_PROP_NOT_FOUND);
4383 	}
4384 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4385 	MDI_PI_UNLOCK(pip);
4386 	return (i_map_nvlist_error_to_mdi(rv));
4387 }
4388 
4389 /*
4390  * mdi_prop_update_int64():
4391  *		Create/Update a 64 bit integer property
4392  */
4393 int
4394 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4395 {
4396 	int rv;
4397 
4398 	if (pip == NULL) {
4399 		return (DDI_PROP_INVAL_ARG);
4400 	}
4401 	ASSERT(!MDI_PI_LOCKED(pip));
4402 	MDI_PI_LOCK(pip);
4403 	if (MDI_PI(pip)->pi_prop == NULL) {
4404 		MDI_PI_UNLOCK(pip);
4405 		return (DDI_PROP_NOT_FOUND);
4406 	}
4407 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4408 	MDI_PI_UNLOCK(pip);
4409 	return (i_map_nvlist_error_to_mdi(rv));
4410 }
4411 
4412 /*
4413  * mdi_prop_update_int_array():
4414  *		Create/Update a int array property
4415  */
4416 int
4417 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4418 	    uint_t nelements)
4419 {
4420 	int rv;
4421 
4422 	if (pip == NULL) {
4423 		return (DDI_PROP_INVAL_ARG);
4424 	}
4425 	ASSERT(!MDI_PI_LOCKED(pip));
4426 	MDI_PI_LOCK(pip);
4427 	if (MDI_PI(pip)->pi_prop == NULL) {
4428 		MDI_PI_UNLOCK(pip);
4429 		return (DDI_PROP_NOT_FOUND);
4430 	}
4431 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4432 	    nelements);
4433 	MDI_PI_UNLOCK(pip);
4434 	return (i_map_nvlist_error_to_mdi(rv));
4435 }
4436 
4437 /*
4438  * mdi_prop_update_string():
4439  *		Create/Update a string property
4440  */
4441 int
4442 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4443 {
4444 	int rv;
4445 
4446 	if (pip == NULL) {
4447 		return (DDI_PROP_INVAL_ARG);
4448 	}
4449 	ASSERT(!MDI_PI_LOCKED(pip));
4450 	MDI_PI_LOCK(pip);
4451 	if (MDI_PI(pip)->pi_prop == NULL) {
4452 		MDI_PI_UNLOCK(pip);
4453 		return (DDI_PROP_NOT_FOUND);
4454 	}
4455 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4456 	MDI_PI_UNLOCK(pip);
4457 	return (i_map_nvlist_error_to_mdi(rv));
4458 }
4459 
4460 /*
4461  * mdi_prop_update_string_array():
4462  *		Create/Update a string array property
4463  */
4464 int
4465 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4466     uint_t nelements)
4467 {
4468 	int rv;
4469 
4470 	if (pip == NULL) {
4471 		return (DDI_PROP_INVAL_ARG);
4472 	}
4473 	ASSERT(!MDI_PI_LOCKED(pip));
4474 	MDI_PI_LOCK(pip);
4475 	if (MDI_PI(pip)->pi_prop == NULL) {
4476 		MDI_PI_UNLOCK(pip);
4477 		return (DDI_PROP_NOT_FOUND);
4478 	}
4479 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4480 	    nelements);
4481 	MDI_PI_UNLOCK(pip);
4482 	return (i_map_nvlist_error_to_mdi(rv));
4483 }
4484 
4485 /*
4486  * mdi_prop_lookup_byte():
4487  * 		Look for byte property identified by name.  The data returned
4488  *		is the actual property and valid as long as mdi_pathinfo_t node
4489  *		is alive.
4490  */
4491 int
4492 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4493 {
4494 	int rv;
4495 
4496 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4497 		return (DDI_PROP_NOT_FOUND);
4498 	}
4499 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4500 	return (i_map_nvlist_error_to_mdi(rv));
4501 }
4502 
4503 
4504 /*
4505  * mdi_prop_lookup_byte_array():
4506  * 		Look for byte array property identified by name.  The data
4507  *		returned is the actual property and valid as long as
4508  *		mdi_pathinfo_t node is alive.
4509  */
4510 int
4511 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4512     uint_t *nelements)
4513 {
4514 	int rv;
4515 
4516 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4517 		return (DDI_PROP_NOT_FOUND);
4518 	}
4519 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4520 	    nelements);
4521 	return (i_map_nvlist_error_to_mdi(rv));
4522 }
4523 
4524 /*
4525  * mdi_prop_lookup_int():
4526  * 		Look for int property identified by name.  The data returned
4527  *		is the actual property and valid as long as mdi_pathinfo_t
4528  *		node is alive.
4529  */
4530 int
4531 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4532 {
4533 	int rv;
4534 
4535 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4536 		return (DDI_PROP_NOT_FOUND);
4537 	}
4538 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4539 	return (i_map_nvlist_error_to_mdi(rv));
4540 }
4541 
4542 /*
4543  * mdi_prop_lookup_int64():
4544  * 		Look for int64 property identified by name.  The data returned
4545  *		is the actual property and valid as long as mdi_pathinfo_t node
4546  *		is alive.
4547  */
4548 int
4549 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4550 {
4551 	int rv;
4552 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4553 		return (DDI_PROP_NOT_FOUND);
4554 	}
4555 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4556 	return (i_map_nvlist_error_to_mdi(rv));
4557 }
4558 
4559 /*
4560  * mdi_prop_lookup_int_array():
4561  * 		Look for int array property identified by name.  The data
4562  *		returned is the actual property and valid as long as
4563  *		mdi_pathinfo_t node is alive.
4564  */
4565 int
4566 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4567     uint_t *nelements)
4568 {
4569 	int rv;
4570 
4571 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4572 		return (DDI_PROP_NOT_FOUND);
4573 	}
4574 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4575 	    (int32_t **)data, nelements);
4576 	return (i_map_nvlist_error_to_mdi(rv));
4577 }
4578 
4579 /*
4580  * mdi_prop_lookup_string():
4581  * 		Look for string property identified by name.  The data
4582  *		returned is the actual property and valid as long as
4583  *		mdi_pathinfo_t node is alive.
4584  */
4585 int
4586 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4587 {
4588 	int rv;
4589 
4590 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4591 		return (DDI_PROP_NOT_FOUND);
4592 	}
4593 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4594 	return (i_map_nvlist_error_to_mdi(rv));
4595 }
4596 
4597 /*
4598  * mdi_prop_lookup_string_array():
4599  * 		Look for string array property identified by name.  The data
4600  *		returned is the actual property and valid as long as
4601  *		mdi_pathinfo_t node is alive.
4602  */
4603 int
4604 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4605     uint_t *nelements)
4606 {
4607 	int rv;
4608 
4609 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4610 		return (DDI_PROP_NOT_FOUND);
4611 	}
4612 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4613 	    nelements);
4614 	return (i_map_nvlist_error_to_mdi(rv));
4615 }
4616 
4617 /*
4618  * mdi_prop_free():
4619  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4620  *		functions return the pointer to actual property data and not a
4621  *		copy of it.  So the data returned is valid as long as
4622  *		mdi_pathinfo_t node is valid.
4623  */
4624 /*ARGSUSED*/
4625 int
4626 mdi_prop_free(void *data)
4627 {
4628 	return (DDI_PROP_SUCCESS);
4629 }
4630 
4631 /*ARGSUSED*/
4632 static void
4633 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4634 {
4635 	char		*phci_path, *ct_path;
4636 	char		*ct_status;
4637 	char		*status;
4638 	dev_info_t	*dip = ct->ct_dip;
4639 	char		lb_buf[64];
4640 
4641 	ASSERT(MDI_CLIENT_LOCKED(ct));
4642 	if ((dip == NULL) || (ddi_get_instance(dip) == -1) ||
4643 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4644 		return;
4645 	}
4646 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4647 		ct_status = "optimal";
4648 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4649 		ct_status = "degraded";
4650 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4651 		ct_status = "failed";
4652 	} else {
4653 		ct_status = "unknown";
4654 	}
4655 
4656 	if (MDI_PI_IS_OFFLINE(pip)) {
4657 		status = "offline";
4658 	} else if (MDI_PI_IS_ONLINE(pip)) {
4659 		status = "online";
4660 	} else if (MDI_PI_IS_STANDBY(pip)) {
4661 		status = "standby";
4662 	} else if (MDI_PI_IS_FAULT(pip)) {
4663 		status = "faulted";
4664 	} else {
4665 		status = "unknown";
4666 	}
4667 
4668 	if (ct->ct_lb == LOAD_BALANCE_LBA) {
4669 		(void) snprintf(lb_buf, sizeof (lb_buf),
4670 		    "%s, region-size: %d", mdi_load_balance_lba,
4671 			ct->ct_lb_args->region_size);
4672 	} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4673 		(void) snprintf(lb_buf, sizeof (lb_buf),
4674 		    "%s", mdi_load_balance_none);
4675 	} else {
4676 		(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4677 		    mdi_load_balance_rr);
4678 	}
4679 
4680 	if (dip) {
4681 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4682 		phci_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4683 		cmn_err(CE_CONT, "?%s (%s%d) multipath status: %s, "
4684 		    "path %s (%s%d) to target address: %s is %s"
4685 		    " Load balancing: %s\n",
4686 		    ddi_pathname(dip, ct_path), ddi_driver_name(dip),
4687 		    ddi_get_instance(dip), ct_status,
4688 		    ddi_pathname(MDI_PI(pip)->pi_phci->ph_dip, phci_path),
4689 		    ddi_driver_name(MDI_PI(pip)->pi_phci->ph_dip),
4690 		    ddi_get_instance(MDI_PI(pip)->pi_phci->ph_dip),
4691 		    MDI_PI(pip)->pi_addr, status, lb_buf);
4692 		kmem_free(phci_path, MAXPATHLEN);
4693 		kmem_free(ct_path, MAXPATHLEN);
4694 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4695 	}
4696 }
4697 
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Utility function for error message management
 *
 * Formats the caller's message, prefixes it with "mdi: " and, when a
 * dev_info node is supplied, with "<driver><instance>: ", then hands it to
 * cmn_err().  A leading '!', '?' or '^' in the formatted message is
 * stripped and re-applied in front of the "mdi: " prefix for CE_NOTE,
 * CE_CONT, CE_WARN and CE_PANIC; CE_NOTE is issued as CE_CONT.  When
 * mdi_debug_logonly is set, the '!' prefix is forced regardless of what
 * the message asked for.  Both the name prefix and the message are
 * limited to MAXNAMELEN bytes each.
 */
/*PRINTFLIKE3*/
static void
i_mdi_log(int level, dev_info_t *dip, const char *fmt, ...)
{
	char		name[MAXNAMELEN];
	char		buf[MAXNAMELEN];
	char		*bp = buf;
	va_list		ap;
	int		log_only = 0;
	int		boot_only = 0;
	int		console_only = 0;
	int		prefixable;

	/* Optional "<driver><instance>: " tag. */
	name[0] = 0;
	if (dip) {
		(void) snprintf(name, MAXNAMELEN, "%s%d: ",
		    ddi_driver_name(dip), ddi_get_instance(dip));
	}

	va_start(ap, fmt);
	(void) vsnprintf(buf, MAXNAMELEN, fmt, ap);
	va_end(ap);

	/* Peel off a destination character if the message carries one. */
	if (buf[0] == '!') {
		log_only = 1;
		bp = &buf[1];
	} else if (buf[0] == '?') {
		boot_only = 1;
		bp = &buf[1];
	} else if (buf[0] == '^') {
		console_only = 1;
		bp = &buf[1];
	}

	if (mdi_debug_logonly) {
		log_only = 1;
		boot_only = 0;
		console_only = 0;
	}

	if (level == CE_NOTE)
		level = CE_CONT;

	/* Only these levels get the destination character re-applied. */
	prefixable = (level == CE_CONT || level == CE_WARN ||
	    level == CE_PANIC);

	/*
	 * The destination character has to be part of the format string
	 * itself, hence one literal format per case.
	 */
	if (prefixable && boot_only) {
		cmn_err(level, "?mdi: %s%s", name, bp);
	} else if (prefixable && console_only) {
		cmn_err(level, "^mdi: %s%s", name, bp);
	} else if (prefixable && log_only) {
		cmn_err(level, "!mdi: %s%s", name, bp);
	} else {
		cmn_err(level, "mdi: %s%s", name, bp);
	}
}
#endif	/* DEBUG */
4773 
4774 void
4775 i_mdi_client_online(dev_info_t *ct_dip)
4776 {
4777 	mdi_client_t	*ct;
4778 
4779 	/*
4780 	 * Client online notification. Mark client state as online
4781 	 * restore our binding with dev_info node
4782 	 */
4783 	ct = i_devi_get_client(ct_dip);
4784 	ASSERT(ct != NULL);
4785 	MDI_CLIENT_LOCK(ct);
4786 	MDI_CLIENT_SET_ONLINE(ct);
4787 	/* catch for any memory leaks */
4788 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4789 	ct->ct_dip = ct_dip;
4790 
4791 	if (ct->ct_power_cnt == 0)
4792 		(void) i_mdi_power_all_phci(ct);
4793 
4794 	MDI_DEBUG(4, (CE_NOTE, ct_dip, "i_mdi_client_online "
4795 	    "i_mdi_pm_hold_client %p\n", (void *)ct));
4796 	i_mdi_pm_hold_client(ct, 1);
4797 
4798 	MDI_CLIENT_UNLOCK(ct);
4799 }
4800 
4801 void
4802 i_mdi_phci_online(dev_info_t *ph_dip)
4803 {
4804 	mdi_phci_t	*ph;
4805 
4806 	/* pHCI online notification. Mark state accordingly */
4807 	ph = i_devi_get_phci(ph_dip);
4808 	ASSERT(ph != NULL);
4809 	MDI_PHCI_LOCK(ph);
4810 	MDI_PHCI_SET_ONLINE(ph);
4811 	MDI_PHCI_UNLOCK(ph);
4812 }
4813 
4814 /*
4815  * mdi_devi_online():
4816  * 		Online notification from NDI framework on pHCI/client
4817  *		device online.
4818  * Return Values:
4819  *		NDI_SUCCESS
4820  *		MDI_FAILURE
4821  */
4822 /*ARGSUSED*/
4823 int
4824 mdi_devi_online(dev_info_t *dip, uint_t flags)
4825 {
4826 	if (MDI_PHCI(dip)) {
4827 		i_mdi_phci_online(dip);
4828 	}
4829 
4830 	if (MDI_CLIENT(dip)) {
4831 		i_mdi_client_online(dip);
4832 	}
4833 	return (NDI_SUCCESS);
4834 }
4835 
4836 /*
4837  * mdi_devi_offline():
4838  * 		Offline notification from NDI framework on pHCI/Client device
4839  *		offline.
4840  *
4841  * Return Values:
4842  *		NDI_SUCCESS
4843  *		NDI_FAILURE
4844  */
4845 /*ARGSUSED*/
4846 int
4847 mdi_devi_offline(dev_info_t *dip, uint_t flags)
4848 {
4849 	int		rv = NDI_SUCCESS;
4850 
4851 	if (MDI_CLIENT(dip)) {
4852 		rv = i_mdi_client_offline(dip, flags);
4853 		if (rv != NDI_SUCCESS)
4854 			return (rv);
4855 	}
4856 
4857 	if (MDI_PHCI(dip)) {
4858 		rv = i_mdi_phci_offline(dip, flags);
4859 
4860 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
4861 			/* set client back online */
4862 			i_mdi_client_online(dip);
4863 		}
4864 	}
4865 
4866 	return (rv);
4867 }
4868 
4869 /*ARGSUSED*/
4870 static int
4871 i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
4872 {
4873 	int		rv = NDI_SUCCESS;
4874 	mdi_phci_t	*ph;
4875 	mdi_client_t	*ct;
4876 	mdi_pathinfo_t	*pip;
4877 	mdi_pathinfo_t	*next;
4878 	mdi_pathinfo_t	*failed_pip = NULL;
4879 	dev_info_t	*cdip;
4880 
4881 	/*
4882 	 * pHCI component offline notification
4883 	 * Make sure that this pHCI instance is free to be offlined.
4884 	 * If it is OK to proceed, Offline and remove all the child
4885 	 * mdi_pathinfo nodes.  This process automatically offlines
4886 	 * corresponding client devices, for which this pHCI provides
4887 	 * critical services.
4888 	 */
4889 	ph = i_devi_get_phci(dip);
4890 	MDI_DEBUG(2, (CE_NOTE, dip, "!mdi_phci_offline called %p %p\n",
4891 	    (void *)dip, (void *)ph));
4892 	if (ph == NULL) {
4893 		return (rv);
4894 	}
4895 
4896 	MDI_PHCI_LOCK(ph);
4897 
4898 	if (MDI_PHCI_IS_OFFLINE(ph)) {
4899 		MDI_DEBUG(1, (CE_WARN, dip, "!pHCI %p already offlined",
4900 		    (void *)ph));
4901 		MDI_PHCI_UNLOCK(ph);
4902 		return (NDI_SUCCESS);
4903 	}
4904 
4905 	/*
4906 	 * Check to see if the pHCI can be offlined
4907 	 */
4908 	if (ph->ph_unstable) {
4909 		MDI_DEBUG(1, (CE_WARN, dip,
4910 		    "!One or more target devices are in transient "
4911 		    "state. This device can not be removed at "
4912 		    "this moment. Please try again later."));
4913 		MDI_PHCI_UNLOCK(ph);
4914 		return (NDI_BUSY);
4915 	}
4916 
4917 	pip = ph->ph_path_head;
4918 	while (pip != NULL) {
4919 		MDI_PI_LOCK(pip);
4920 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4921 
4922 		/*
4923 		 * The mdi_pathinfo state is OK. Check the client state.
4924 		 * If failover in progress fail the pHCI from offlining
4925 		 */
4926 		ct = MDI_PI(pip)->pi_client;
4927 		i_mdi_client_lock(ct, pip);
4928 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
4929 		    (ct->ct_unstable)) {
4930 			/*
4931 			 * Failover is in progress, Fail the DR
4932 			 */
4933 			MDI_DEBUG(1, (CE_WARN, dip,
4934 			    "!pHCI device (%s%d) is Busy. %s",
4935 			    ddi_driver_name(dip), ddi_get_instance(dip),
4936 			    "This device can not be removed at "
4937 			    "this moment. Please try again later."));
4938 			MDI_PI_UNLOCK(pip);
4939 			i_mdi_client_unlock(ct);
4940 			MDI_PHCI_UNLOCK(ph);
4941 			return (NDI_BUSY);
4942 		}
4943 		MDI_PI_UNLOCK(pip);
4944 
4945 		/*
4946 		 * Check to see of we are removing the last path of this
4947 		 * client device...
4948 		 */
4949 		cdip = ct->ct_dip;
4950 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
4951 		    (i_mdi_client_compute_state(ct, ph) ==
4952 		    MDI_CLIENT_STATE_FAILED)) {
4953 			i_mdi_client_unlock(ct);
4954 			MDI_PHCI_UNLOCK(ph);
4955 			if (ndi_devi_offline(cdip, 0) != NDI_SUCCESS) {
4956 				/*
4957 				 * ndi_devi_offline() failed.
4958 				 * This pHCI provides the critical path
4959 				 * to one or more client devices.
4960 				 * Return busy.
4961 				 */
4962 				MDI_PHCI_LOCK(ph);
4963 				MDI_DEBUG(1, (CE_WARN, dip,
4964 				    "!pHCI device (%s%d) is Busy. %s",
4965 				    ddi_driver_name(dip), ddi_get_instance(dip),
4966 				    "This device can not be removed at "
4967 				    "this moment. Please try again later."));
4968 				failed_pip = pip;
4969 				break;
4970 			} else {
4971 				MDI_PHCI_LOCK(ph);
4972 				pip = next;
4973 			}
4974 		} else {
4975 			i_mdi_client_unlock(ct);
4976 			pip = next;
4977 		}
4978 	}
4979 
4980 	if (failed_pip) {
4981 		pip = ph->ph_path_head;
4982 		while (pip != failed_pip) {
4983 			MDI_PI_LOCK(pip);
4984 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4985 			ct = MDI_PI(pip)->pi_client;
4986 			i_mdi_client_lock(ct, pip);
4987 			cdip = ct->ct_dip;
4988 			switch (MDI_CLIENT_STATE(ct)) {
4989 			case MDI_CLIENT_STATE_OPTIMAL:
4990 			case MDI_CLIENT_STATE_DEGRADED:
4991 				if (cdip) {
4992 					MDI_PI_UNLOCK(pip);
4993 					i_mdi_client_unlock(ct);
4994 					MDI_PHCI_UNLOCK(ph);
4995 					(void) ndi_devi_online(cdip, 0);
4996 					MDI_PHCI_LOCK(ph);
4997 					pip = next;
4998 					continue;
4999 				}
5000 				break;
5001 
5002 			case MDI_CLIENT_STATE_FAILED:
5003 				if (cdip) {
5004 					MDI_PI_UNLOCK(pip);
5005 					i_mdi_client_unlock(ct);
5006 					MDI_PHCI_UNLOCK(ph);
5007 					(void) ndi_devi_offline(cdip, 0);
5008 					MDI_PHCI_LOCK(ph);
5009 					pip = next;
5010 					continue;
5011 				}
5012 				break;
5013 			}
5014 			MDI_PI_UNLOCK(pip);
5015 			i_mdi_client_unlock(ct);
5016 			pip = next;
5017 		}
5018 		MDI_PHCI_UNLOCK(ph);
5019 		return (NDI_BUSY);
5020 	}
5021 
5022 	/*
5023 	 * Mark the pHCI as offline
5024 	 */
5025 	MDI_PHCI_SET_OFFLINE(ph);
5026 
5027 	/*
5028 	 * Mark the child mdi_pathinfo nodes as transient
5029 	 */
5030 	pip = ph->ph_path_head;
5031 	while (pip != NULL) {
5032 		MDI_PI_LOCK(pip);
5033 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5034 		MDI_PI_SET_OFFLINING(pip);
5035 		MDI_PI_UNLOCK(pip);
5036 		pip = next;
5037 	}
5038 	MDI_PHCI_UNLOCK(ph);
5039 	/*
5040 	 * Give a chance for any pending commands to execute
5041 	 */
5042 	delay(1);
5043 	MDI_PHCI_LOCK(ph);
5044 	pip = ph->ph_path_head;
5045 	while (pip != NULL) {
5046 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5047 		(void) i_mdi_pi_offline(pip, flags);
5048 		MDI_PI_LOCK(pip);
5049 		ct = MDI_PI(pip)->pi_client;
5050 		if (!MDI_PI_IS_OFFLINE(pip)) {
5051 			MDI_DEBUG(1, (CE_WARN, dip,
5052 			    "!pHCI device (%s%d) is Busy. %s",
5053 			    ddi_driver_name(dip), ddi_get_instance(dip),
5054 			    "This device can not be removed at "
5055 			    "this moment. Please try again later."));
5056 			MDI_PI_UNLOCK(pip);
5057 			MDI_PHCI_SET_ONLINE(ph);
5058 			MDI_PHCI_UNLOCK(ph);
5059 			return (NDI_BUSY);
5060 		}
5061 		MDI_PI_UNLOCK(pip);
5062 		pip = next;
5063 	}
5064 	MDI_PHCI_UNLOCK(ph);
5065 
5066 	return (rv);
5067 }
5068 
5069 void
5070 mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
5071 {
5072 	mdi_phci_t	*ph;
5073 	mdi_client_t	*ct;
5074 	mdi_pathinfo_t	*pip;
5075 	mdi_pathinfo_t	*next;
5076 	dev_info_t	*cdip;
5077 
5078 	if (!MDI_PHCI(dip))
5079 		return;
5080 
5081 	ph = i_devi_get_phci(dip);
5082 	if (ph == NULL) {
5083 		return;
5084 	}
5085 
5086 	MDI_PHCI_LOCK(ph);
5087 
5088 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5089 		/* has no last path */
5090 		MDI_PHCI_UNLOCK(ph);
5091 		return;
5092 	}
5093 
5094 	pip = ph->ph_path_head;
5095 	while (pip != NULL) {
5096 		MDI_PI_LOCK(pip);
5097 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5098 
5099 		ct = MDI_PI(pip)->pi_client;
5100 		i_mdi_client_lock(ct, pip);
5101 		MDI_PI_UNLOCK(pip);
5102 
5103 		cdip = ct->ct_dip;
5104 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5105 		    (i_mdi_client_compute_state(ct, ph) ==
5106 		    MDI_CLIENT_STATE_FAILED)) {
5107 			/* Last path. Mark client dip as retiring */
5108 			i_mdi_client_unlock(ct);
5109 			MDI_PHCI_UNLOCK(ph);
5110 			(void) e_ddi_mark_retiring(cdip, cons_array);
5111 			MDI_PHCI_LOCK(ph);
5112 			pip = next;
5113 		} else {
5114 			i_mdi_client_unlock(ct);
5115 			pip = next;
5116 		}
5117 	}
5118 
5119 	MDI_PHCI_UNLOCK(ph);
5120 
5121 	return;
5122 }
5123 
5124 void
5125 mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
5126 {
5127 	mdi_phci_t	*ph;
5128 	mdi_client_t	*ct;
5129 	mdi_pathinfo_t	*pip;
5130 	mdi_pathinfo_t	*next;
5131 	dev_info_t	*cdip;
5132 
5133 	if (!MDI_PHCI(dip))
5134 		return;
5135 
5136 	ph = i_devi_get_phci(dip);
5137 	if (ph == NULL)
5138 		return;
5139 
5140 	MDI_PHCI_LOCK(ph);
5141 
5142 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5143 		MDI_PHCI_UNLOCK(ph);
5144 		/* not last path */
5145 		return;
5146 	}
5147 
5148 	if (ph->ph_unstable) {
5149 		MDI_PHCI_UNLOCK(ph);
5150 		/* can't check for constraints */
5151 		*constraint = 0;
5152 		return;
5153 	}
5154 
5155 	pip = ph->ph_path_head;
5156 	while (pip != NULL) {
5157 		MDI_PI_LOCK(pip);
5158 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5159 
5160 		/*
5161 		 * The mdi_pathinfo state is OK. Check the client state.
5162 		 * If failover in progress fail the pHCI from offlining
5163 		 */
5164 		ct = MDI_PI(pip)->pi_client;
5165 		i_mdi_client_lock(ct, pip);
5166 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5167 		    (ct->ct_unstable)) {
5168 			/*
5169 			 * Failover is in progress, can't check for constraints
5170 			 */
5171 			MDI_PI_UNLOCK(pip);
5172 			i_mdi_client_unlock(ct);
5173 			MDI_PHCI_UNLOCK(ph);
5174 			*constraint = 0;
5175 			return;
5176 		}
5177 		MDI_PI_UNLOCK(pip);
5178 
5179 		/*
5180 		 * Check to see of we are retiring the last path of this
5181 		 * client device...
5182 		 */
5183 		cdip = ct->ct_dip;
5184 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5185 		    (i_mdi_client_compute_state(ct, ph) ==
5186 		    MDI_CLIENT_STATE_FAILED)) {
5187 			i_mdi_client_unlock(ct);
5188 			MDI_PHCI_UNLOCK(ph);
5189 			(void) e_ddi_retire_notify(cdip, constraint);
5190 			MDI_PHCI_LOCK(ph);
5191 			pip = next;
5192 		} else {
5193 			i_mdi_client_unlock(ct);
5194 			pip = next;
5195 		}
5196 	}
5197 
5198 	MDI_PHCI_UNLOCK(ph);
5199 
5200 	return;
5201 }
5202 
5203 /*
5204  * offline the path(s) hanging off the PHCI. If the
5205  * last path to any client, check that constraints
5206  * have been applied.
5207  */
5208 void
5209 mdi_phci_retire_finalize(dev_info_t *dip, int phci_only)
5210 {
5211 	mdi_phci_t	*ph;
5212 	mdi_client_t	*ct;
5213 	mdi_pathinfo_t	*pip;
5214 	mdi_pathinfo_t	*next;
5215 	dev_info_t	*cdip;
5216 	int		unstable = 0;
5217 	int		constraint;
5218 
5219 	if (!MDI_PHCI(dip))
5220 		return;
5221 
5222 	ph = i_devi_get_phci(dip);
5223 	if (ph == NULL) {
5224 		/* no last path and no pips */
5225 		return;
5226 	}
5227 
5228 	MDI_PHCI_LOCK(ph);
5229 
5230 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5231 		MDI_PHCI_UNLOCK(ph);
5232 		/* no last path and no pips */
5233 		return;
5234 	}
5235 
5236 	/*
5237 	 * Check to see if the pHCI can be offlined
5238 	 */
5239 	if (ph->ph_unstable) {
5240 		unstable = 1;
5241 	}
5242 
5243 	pip = ph->ph_path_head;
5244 	while (pip != NULL) {
5245 		MDI_PI_LOCK(pip);
5246 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5247 
5248 		/*
5249 		 * if failover in progress fail the pHCI from offlining
5250 		 */
5251 		ct = MDI_PI(pip)->pi_client;
5252 		i_mdi_client_lock(ct, pip);
5253 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5254 		    (ct->ct_unstable)) {
5255 			unstable = 1;
5256 		}
5257 		MDI_PI_UNLOCK(pip);
5258 
5259 		/*
5260 		 * Check to see of we are removing the last path of this
5261 		 * client device...
5262 		 */
5263 		cdip = ct->ct_dip;
5264 		if (!phci_only && cdip &&
5265 		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5266 		    (i_mdi_client_compute_state(ct, ph) ==
5267 		    MDI_CLIENT_STATE_FAILED)) {
5268 			i_mdi_client_unlock(ct);
5269 			MDI_PHCI_UNLOCK(ph);
5270 			/*
5271 			 * We don't retire clients we just retire the
5272 			 * path to a client. If it is the last path
5273 			 * to a client, constraints are checked and
5274 			 * if we pass the last path is offlined. MPXIO will
5275 			 * then fail all I/Os to the client. Since we don't
5276 			 * want to retire the client on a path error
5277 			 * set constraint = 0 so that the client dip
5278 			 * is not retired.
5279 			 */
5280 			constraint = 0;
5281 			(void) e_ddi_retire_finalize(cdip, &constraint);
5282 			MDI_PHCI_LOCK(ph);
5283 			pip = next;
5284 		} else {
5285 			i_mdi_client_unlock(ct);
5286 			pip = next;
5287 		}
5288 	}
5289 
5290 	/*
5291 	 * Cannot offline pip(s)
5292 	 */
5293 	if (unstable) {
5294 		cmn_err(CE_WARN, "PHCI in transient state, cannot "
5295 		    "retire, dip = %p", (void *)dip);
5296 		MDI_PHCI_UNLOCK(ph);
5297 		return;
5298 	}
5299 
5300 	/*
5301 	 * Mark the pHCI as offline
5302 	 */
5303 	MDI_PHCI_SET_OFFLINE(ph);
5304 
5305 	/*
5306 	 * Mark the child mdi_pathinfo nodes as transient
5307 	 */
5308 	pip = ph->ph_path_head;
5309 	while (pip != NULL) {
5310 		MDI_PI_LOCK(pip);
5311 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5312 		MDI_PI_SET_OFFLINING(pip);
5313 		MDI_PI_UNLOCK(pip);
5314 		pip = next;
5315 	}
5316 	MDI_PHCI_UNLOCK(ph);
5317 	/*
5318 	 * Give a chance for any pending commands to execute
5319 	 */
5320 	delay(1);
5321 	MDI_PHCI_LOCK(ph);
5322 	pip = ph->ph_path_head;
5323 	while (pip != NULL) {
5324 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5325 		(void) i_mdi_pi_offline(pip, 0);
5326 		MDI_PI_LOCK(pip);
5327 		ct = MDI_PI(pip)->pi_client;
5328 		if (!MDI_PI_IS_OFFLINE(pip)) {
5329 			cmn_err(CE_WARN, "PHCI busy, cannot offline path: "
5330 			    "PHCI dip = %p", (void *)dip);
5331 			MDI_PI_UNLOCK(pip);
5332 			MDI_PHCI_SET_ONLINE(ph);
5333 			MDI_PHCI_UNLOCK(ph);
5334 			return;
5335 		}
5336 		MDI_PI_UNLOCK(pip);
5337 		pip = next;
5338 	}
5339 	MDI_PHCI_UNLOCK(ph);
5340 
5341 	return;
5342 }
5343 
5344 void
5345 mdi_phci_unretire(dev_info_t *dip)
5346 {
5347 	ASSERT(MDI_PHCI(dip));
5348 
5349 	/*
5350 	 * Online the phci
5351 	 */
5352 	i_mdi_phci_online(dip);
5353 }
5354 
5355 /*ARGSUSED*/
5356 static int
5357 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
5358 {
5359 	int		rv = NDI_SUCCESS;
5360 	mdi_client_t	*ct;
5361 
5362 	/*
5363 	 * Client component to go offline.  Make sure that we are
5364 	 * not in failing over state and update client state
5365 	 * accordingly
5366 	 */
5367 	ct = i_devi_get_client(dip);
5368 	MDI_DEBUG(2, (CE_NOTE, dip, "!i_mdi_client_offline called %p %p\n",
5369 	    (void *)dip, (void *)ct));
5370 	if (ct != NULL) {
5371 		MDI_CLIENT_LOCK(ct);
5372 		if (ct->ct_unstable) {
5373 			/*
5374 			 * One or more paths are in transient state,
5375 			 * Dont allow offline of a client device
5376 			 */
5377 			MDI_DEBUG(1, (CE_WARN, dip,
5378 			    "!One or more paths to this device is "
5379 			    "in transient state. This device can not "
5380 			    "be removed at this moment. "
5381 			    "Please try again later."));
5382 			MDI_CLIENT_UNLOCK(ct);
5383 			return (NDI_BUSY);
5384 		}
5385 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
5386 			/*
5387 			 * Failover is in progress, Dont allow DR of
5388 			 * a client device
5389 			 */
5390 			MDI_DEBUG(1, (CE_WARN, dip,
5391 			    "!Client device (%s%d) is Busy. %s",
5392 			    ddi_driver_name(dip), ddi_get_instance(dip),
5393 			    "This device can not be removed at "
5394 			    "this moment. Please try again later."));
5395 			MDI_CLIENT_UNLOCK(ct);
5396 			return (NDI_BUSY);
5397 		}
5398 		MDI_CLIENT_SET_OFFLINE(ct);
5399 
5400 		/*
5401 		 * Unbind our relationship with the dev_info node
5402 		 */
5403 		if (flags & NDI_DEVI_REMOVE) {
5404 			ct->ct_dip = NULL;
5405 		}
5406 		MDI_CLIENT_UNLOCK(ct);
5407 	}
5408 	return (rv);
5409 }
5410 
5411 /*
5412  * mdi_pre_attach():
5413  *		Pre attach() notification handler
5414  */
5415 /*ARGSUSED*/
5416 int
5417 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5418 {
5419 	/* don't support old DDI_PM_RESUME */
5420 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5421 	    (cmd == DDI_PM_RESUME))
5422 		return (DDI_FAILURE);
5423 
5424 	return (DDI_SUCCESS);
5425 }
5426 
5427 /*
5428  * mdi_post_attach():
5429  *		Post attach() notification handler
5430  */
5431 /*ARGSUSED*/
5432 void
5433 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5434 {
5435 	mdi_phci_t	*ph;
5436 	mdi_client_t	*ct;
5437 	mdi_vhci_t	*vh;
5438 
5439 	if (MDI_PHCI(dip)) {
5440 		ph = i_devi_get_phci(dip);
5441 		ASSERT(ph != NULL);
5442 
5443 		MDI_PHCI_LOCK(ph);
5444 		switch (cmd) {
5445 		case DDI_ATTACH:
5446 			MDI_DEBUG(2, (CE_NOTE, dip,
5447 			    "!pHCI post_attach: called %p\n", (void *)ph));
5448 			if (error == DDI_SUCCESS) {
5449 				MDI_PHCI_SET_ATTACH(ph);
5450 			} else {
5451 				MDI_DEBUG(1, (CE_NOTE, dip,
5452 				    "!pHCI post_attach: failed error=%d\n",
5453 				    error));
5454 				MDI_PHCI_SET_DETACH(ph);
5455 			}
5456 			break;
5457 
5458 		case DDI_RESUME:
5459 			MDI_DEBUG(2, (CE_NOTE, dip,
5460 			    "!pHCI post_resume: called %p\n", (void *)ph));
5461 			if (error == DDI_SUCCESS) {
5462 				MDI_PHCI_SET_RESUME(ph);
5463 			} else {
5464 				MDI_DEBUG(1, (CE_NOTE, dip,
5465 				    "!pHCI post_resume: failed error=%d\n",
5466 				    error));
5467 				MDI_PHCI_SET_SUSPEND(ph);
5468 			}
5469 			break;
5470 		}
5471 		MDI_PHCI_UNLOCK(ph);
5472 	}
5473 
5474 	if (MDI_CLIENT(dip)) {
5475 		ct = i_devi_get_client(dip);
5476 		ASSERT(ct != NULL);
5477 
5478 		MDI_CLIENT_LOCK(ct);
5479 		switch (cmd) {
5480 		case DDI_ATTACH:
5481 			MDI_DEBUG(2, (CE_NOTE, dip,
5482 			    "!Client post_attach: called %p\n", (void *)ct));
5483 			if (error != DDI_SUCCESS) {
5484 				MDI_DEBUG(1, (CE_NOTE, dip,
5485 				    "!Client post_attach: failed error=%d\n",
5486 				    error));
5487 				MDI_CLIENT_SET_DETACH(ct);
5488 				MDI_DEBUG(4, (CE_WARN, dip,
5489 				    "mdi_post_attach i_mdi_pm_reset_client\n"));
5490 				i_mdi_pm_reset_client(ct);
5491 				break;
5492 			}
5493 
5494 			/*
5495 			 * Client device has successfully attached, inform
5496 			 * the vhci.
5497 			 */
5498 			vh = ct->ct_vhci;
5499 			if (vh->vh_ops->vo_client_attached)
5500 				(*vh->vh_ops->vo_client_attached)(dip);
5501 
5502 			MDI_CLIENT_SET_ATTACH(ct);
5503 			break;
5504 
5505 		case DDI_RESUME:
5506 			MDI_DEBUG(2, (CE_NOTE, dip,
5507 			    "!Client post_attach: called %p\n", (void *)ct));
5508 			if (error == DDI_SUCCESS) {
5509 				MDI_CLIENT_SET_RESUME(ct);
5510 			} else {
5511 				MDI_DEBUG(1, (CE_NOTE, dip,
5512 				    "!Client post_resume: failed error=%d\n",
5513 				    error));
5514 				MDI_CLIENT_SET_SUSPEND(ct);
5515 			}
5516 			break;
5517 		}
5518 		MDI_CLIENT_UNLOCK(ct);
5519 	}
5520 }
5521 
5522 /*
5523  * mdi_pre_detach():
5524  *		Pre detach notification handler
5525  */
5526 /*ARGSUSED*/
5527 int
5528 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5529 {
5530 	int rv = DDI_SUCCESS;
5531 
5532 	if (MDI_CLIENT(dip)) {
5533 		(void) i_mdi_client_pre_detach(dip, cmd);
5534 	}
5535 
5536 	if (MDI_PHCI(dip)) {
5537 		rv = i_mdi_phci_pre_detach(dip, cmd);
5538 	}
5539 
5540 	return (rv);
5541 }
5542 
/*
 * i_mdi_phci_pre_detach():
 *		pHCI half of the pre-detach notification.  The pHCI lock is
 *		held from just before the switch on cmd until just before
 *		returning.
 *
 *		DDI_DETACH:	refused unless the pHCI is already offline;
 *				otherwise the pHCI is marked as detaching.
 *		DDI_SUSPEND:	every client reachable through this pHCI's
 *				path list that is neither detached nor already
 *				suspended is suspended first.  If one of those
 *				suspends fails, the clients visited before the
 *				failing path are resumed again and the failure
 *				is returned.
 *		other:		DDI_FAILURE.
 * Return Values:
 *		DDI_SUCCESS (also when dip has no pHCI structure)
 *		DDI_FAILURE, or the failing devi_detach() return value
 */
/*ARGSUSED*/
static int
i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int		rv = DDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*failed_pip = NULL;	/* path whose client failed */
	mdi_pathinfo_t	*next;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		/* Nothing registered for this dip; nothing to veto. */
		return (rv);
	}

	MDI_PHCI_LOCK(ph);
	switch (cmd) {
	case DDI_DETACH:
		MDI_DEBUG(2, (CE_NOTE, dip,
		    "!pHCI pre_detach: called %p\n", (void *)ph));
		if (!MDI_PHCI_IS_OFFLINE(ph)) {
			/*
			 * mdi_pathinfo nodes are still attached to
			 * this pHCI. Fail the detach for this pHCI.
			 */
			MDI_DEBUG(2, (CE_WARN, dip,
			    "!pHCI pre_detach: "
			    "mdi_pathinfo nodes are still attached "
			    "%p\n", (void *)ph));
			rv = DDI_FAILURE;
			break;
		}
		MDI_PHCI_SET_DETACH(ph);
		break;

	case DDI_SUSPEND:
		/*
		 * pHCI is getting suspended.  Since mpxio client
		 * devices may not be suspended at this point, to avoid
		 * a potential stack overflow, it is important to suspend
		 * client devices before pHCI can be suspended.
		 */

		MDI_DEBUG(2, (CE_NOTE, dip,
		    "!pHCI pre_suspend: called %p\n", (void *)ph));
		/*
		 * Suspend all the client devices accessible through this pHCI
		 */
		pip = ph->ph_path_head;
		while (pip != NULL && rv == DDI_SUCCESS) {
			dev_info_t *cdip;
			/*
			 * Sample the next link, the owning client and its
			 * dip while the path is locked; the client lock is
			 * taken before the path lock is dropped.
			 */
			MDI_PI_LOCK(pip);
			next =
			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			MDI_PI_UNLOCK(pip);
			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
				/* Client lock is dropped before suspending. */
				i_mdi_client_unlock(ct);
				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
				    DDI_SUCCESS) {
					/*
					 * Suspend of one of the client
					 * device has failed.
					 */
					MDI_DEBUG(1, (CE_WARN, dip,
					    "!Suspend of device (%s%d) failed.",
					    ddi_driver_name(cdip),
					    ddi_get_instance(cdip)));
					/* Remember where to stop the undo. */
					failed_pip = pip;
					break;
				}
			} else {
				i_mdi_client_unlock(ct);
			}
			pip = next;
		}

		if (rv == DDI_SUCCESS) {
			/*
			 * Suspend of client devices is complete. Proceed
			 * with pHCI suspend.
			 */
			MDI_PHCI_SET_SUSPEND(ph);
		} else {
			/*
			 * Undo the work done above: walk the path list again
			 * from the head up to (not including) the path whose
			 * client failed, resuming each client that is marked
			 * suspended.  The resume result is ignored.
			 */
			pip = ph->ph_path_head;
			while (pip != failed_pip) {
				dev_info_t *cdip;
				MDI_PI_LOCK(pip);
				next =
				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
				ct = MDI_PI(pip)->pi_client;
				i_mdi_client_lock(ct, pip);
				cdip = ct->ct_dip;
				MDI_PI_UNLOCK(pip);
				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
					i_mdi_client_unlock(ct);
					(void) devi_attach(cdip, DDI_RESUME);
				} else {
					i_mdi_client_unlock(ct);
				}
				pip = next;
			}
		}
		break;

	default:
		/* Any other detach command is not supported here. */
		rv = DDI_FAILURE;
		break;
	}
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
5663 
5664 /*ARGSUSED*/
5665 static int
5666 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5667 {
5668 	int		rv = DDI_SUCCESS;
5669 	mdi_client_t	*ct;
5670 
5671 	ct = i_devi_get_client(dip);
5672 	if (ct == NULL) {
5673 		return (rv);
5674 	}
5675 
5676 	MDI_CLIENT_LOCK(ct);
5677 	switch (cmd) {
5678 	case DDI_DETACH:
5679 		MDI_DEBUG(2, (CE_NOTE, dip,
5680 		    "!Client pre_detach: called %p\n", (void *)ct));
5681 		MDI_CLIENT_SET_DETACH(ct);
5682 		break;
5683 
5684 	case DDI_SUSPEND:
5685 		MDI_DEBUG(2, (CE_NOTE, dip,
5686 		    "!Client pre_suspend: called %p\n", (void *)ct));
5687 		MDI_CLIENT_SET_SUSPEND(ct);
5688 		break;
5689 
5690 	default:
5691 		rv = DDI_FAILURE;
5692 		break;
5693 	}
5694 	MDI_CLIENT_UNLOCK(ct);
5695 	return (rv);
5696 }
5697 
5698 /*
5699  * mdi_post_detach():
5700  *		Post detach notification handler
5701  */
5702 /*ARGSUSED*/
5703 void
5704 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5705 {
5706 	/*
5707 	 * Detach/Suspend of mpxio component failed. Update our state
5708 	 * too
5709 	 */
5710 	if (MDI_PHCI(dip))
5711 		i_mdi_phci_post_detach(dip, cmd, error);
5712 
5713 	if (MDI_CLIENT(dip))
5714 		i_mdi_client_post_detach(dip, cmd, error);
5715 }
5716 
5717 /*ARGSUSED*/
5718 static void
5719 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5720 {
5721 	mdi_phci_t	*ph;
5722 
5723 	/*
5724 	 * Detach/Suspend of phci component failed. Update our state
5725 	 * too
5726 	 */
5727 	ph = i_devi_get_phci(dip);
5728 	if (ph == NULL) {
5729 		return;
5730 	}
5731 
5732 	MDI_PHCI_LOCK(ph);
5733 	/*
5734 	 * Detach of pHCI failed. Restore back converse
5735 	 * state
5736 	 */
5737 	switch (cmd) {
5738 	case DDI_DETACH:
5739 		MDI_DEBUG(2, (CE_NOTE, dip,
5740 		    "!pHCI post_detach: called %p\n", (void *)ph));
5741 		if (error != DDI_SUCCESS)
5742 			MDI_PHCI_SET_ATTACH(ph);
5743 		break;
5744 
5745 	case DDI_SUSPEND:
5746 		MDI_DEBUG(2, (CE_NOTE, dip,
5747 		    "!pHCI post_suspend: called %p\n", (void *)ph));
5748 		if (error != DDI_SUCCESS)
5749 			MDI_PHCI_SET_RESUME(ph);
5750 		break;
5751 	}
5752 	MDI_PHCI_UNLOCK(ph);
5753 }
5754 
5755 /*ARGSUSED*/
5756 static void
5757 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5758 {
5759 	mdi_client_t	*ct;
5760 
5761 	ct = i_devi_get_client(dip);
5762 	if (ct == NULL) {
5763 		return;
5764 	}
5765 	MDI_CLIENT_LOCK(ct);
5766 	/*
5767 	 * Detach of Client failed. Restore back converse
5768 	 * state
5769 	 */
5770 	switch (cmd) {
5771 	case DDI_DETACH:
5772 		MDI_DEBUG(2, (CE_NOTE, dip,
5773 		    "!Client post_detach: called %p\n", (void *)ct));
5774 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5775 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5776 			    "i_mdi_pm_rele_client\n"));
5777 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5778 		} else {
5779 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5780 			    "i_mdi_pm_reset_client\n"));
5781 			i_mdi_pm_reset_client(ct);
5782 		}
5783 		if (error != DDI_SUCCESS)
5784 			MDI_CLIENT_SET_ATTACH(ct);
5785 		break;
5786 
5787 	case DDI_SUSPEND:
5788 		MDI_DEBUG(2, (CE_NOTE, dip,
5789 		    "!Client post_suspend: called %p\n", (void *)ct));
5790 		if (error != DDI_SUCCESS)
5791 			MDI_CLIENT_SET_RESUME(ct);
5792 		break;
5793 	}
5794 	MDI_CLIENT_UNLOCK(ct);
5795 }
5796 
5797 int
5798 mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
5799 {
5800 	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
5801 }
5802 
5803 /*
5804  * create and install per-path (client - pHCI) statistics
5805  * I/O stats supported: nread, nwritten, reads, and writes
5806  * Error stats - hard errors, soft errors, & transport errors
5807  */
5808 int
5809 mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
5810 {
5811 	kstat_t			*kiosp, *kerrsp;
5812 	struct pi_errs		*nsp;
5813 	struct mdi_pi_kstats	*mdi_statp;
5814 
5815 	if (MDI_PI(pip)->pi_kstats != NULL)
5816 		return (MDI_SUCCESS);
5817 
5818 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5819 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
5820 		return (MDI_FAILURE);
5821 	}
5822 
5823 	(void) strcat(ksname, ",err");
5824 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5825 	    KSTAT_TYPE_NAMED,
5826 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5827 	if (kerrsp == NULL) {
5828 		kstat_delete(kiosp);
5829 		return (MDI_FAILURE);
5830 	}
5831 
5832 	nsp = (struct pi_errs *)kerrsp->ks_data;
5833 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
5834 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
5835 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
5836 	    KSTAT_DATA_UINT32);
5837 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
5838 	    KSTAT_DATA_UINT32);
5839 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
5840 	    KSTAT_DATA_UINT32);
5841 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
5842 	    KSTAT_DATA_UINT32);
5843 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
5844 	    KSTAT_DATA_UINT32);
5845 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
5846 	    KSTAT_DATA_UINT32);
5847 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
5848 	    KSTAT_DATA_UINT32);
5849 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
5850 
5851 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
5852 	mdi_statp->pi_kstat_ref = 1;
5853 	mdi_statp->pi_kstat_iostats = kiosp;
5854 	mdi_statp->pi_kstat_errstats = kerrsp;
5855 	kstat_install(kiosp);
5856 	kstat_install(kerrsp);
5857 	MDI_PI(pip)->pi_kstats = mdi_statp;
5858 	return (MDI_SUCCESS);
5859 }
5860 
5861 /*
5862  * destroy per-path properties
5863  */
5864 static void
5865 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
5866 {
5867 
5868 	struct mdi_pi_kstats *mdi_statp;
5869 
5870 	if (MDI_PI(pip)->pi_kstats == NULL)
5871 		return;
5872 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
5873 		return;
5874 
5875 	MDI_PI(pip)->pi_kstats = NULL;
5876 
5877 	/*
5878 	 * the kstat may be shared between multiple pathinfo nodes
5879 	 * decrement this pathinfo's usage, removing the kstats
5880 	 * themselves when the last pathinfo reference is removed.
5881 	 */
5882 	ASSERT(mdi_statp->pi_kstat_ref > 0);
5883 	if (--mdi_statp->pi_kstat_ref != 0)
5884 		return;
5885 
5886 	kstat_delete(mdi_statp->pi_kstat_iostats);
5887 	kstat_delete(mdi_statp->pi_kstat_errstats);
5888 	kmem_free(mdi_statp, sizeof (*mdi_statp));
5889 }
5890 
5891 /*
5892  * update I/O paths KSTATS
5893  */
5894 void
5895 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
5896 {
5897 	kstat_t *iostatp;
5898 	size_t xfer_cnt;
5899 
5900 	ASSERT(pip != NULL);
5901 
5902 	/*
5903 	 * I/O can be driven across a path prior to having path
5904 	 * statistics available, i.e. probe(9e).
5905 	 */
5906 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
5907 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
5908 		xfer_cnt = bp->b_bcount - bp->b_resid;
5909 		if (bp->b_flags & B_READ) {
5910 			KSTAT_IO_PTR(iostatp)->reads++;
5911 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
5912 		} else {
5913 			KSTAT_IO_PTR(iostatp)->writes++;
5914 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
5915 		}
5916 	}
5917 }
5918 
5919 /*
5920  * Enable the path(specific client/target/initiator)
5921  * Enabling a path means that MPxIO may select the enabled path for routing
5922  * future I/O requests, subject to other path state constraints.
5923  */
5924 int
5925 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
5926 {
5927 	mdi_phci_t	*ph;
5928 
5929 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5930 	if (ph == NULL) {
5931 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5932 			" failed. pip: %p ph = NULL\n", (void *)pip));
5933 		return (MDI_FAILURE);
5934 	}
5935 
5936 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
5937 		MDI_ENABLE_OP);
5938 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5939 		" Returning success pip = %p. ph = %p\n",
5940 		(void *)pip, (void *)ph));
5941 	return (MDI_SUCCESS);
5942 
5943 }
5944 
5945 /*
5946  * Disable the path (specific client/target/initiator)
5947  * Disabling a path means that MPxIO will not select the disabled path for
5948  * routing any new I/O requests.
5949  */
5950 int
5951 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
5952 {
5953 	mdi_phci_t	*ph;
5954 
5955 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5956 	if (ph == NULL) {
5957 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5958 			" failed. pip: %p ph = NULL\n", (void *)pip));
5959 		return (MDI_FAILURE);
5960 	}
5961 
5962 	(void) i_mdi_enable_disable_path(pip,
5963 			ph->ph_vhci, flags, MDI_DISABLE_OP);
5964 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5965 		"Returning success pip = %p. ph = %p",
5966 		(void *)pip, (void *)ph));
5967 	return (MDI_SUCCESS);
5968 }
5969 
5970 /*
5971  * disable the path to a particular pHCI (pHCI specified in the phci_path
5972  * argument) for a particular client (specified in the client_path argument).
5973  * Disabling a path means that MPxIO will not select the disabled path for
5974  * routing any new I/O requests.
5975  * NOTE: this will be removed once the NWS files are changed to use the new
5976  * mdi_{enable,disable}_path interfaces
5977  */
5978 int
5979 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5980 {
5981 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
5982 }
5983 
5984 /*
5985  * Enable the path to a particular pHCI (pHCI specified in the phci_path
5986  * argument) for a particular client (specified in the client_path argument).
5987  * Enabling a path means that MPxIO may select the enabled path for routing
5988  * future I/O requests, subject to other path state constraints.
5989  * NOTE: this will be removed once the NWS files are changed to use the new
5990  * mdi_{enable,disable}_path interfaces
5991  */
5992 
5993 int
5994 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5995 {
5996 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
5997 }
5998 
5999 /*
6000  * Common routine for doing enable/disable.
6001  */
6002 static mdi_pathinfo_t *
6003 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
6004 		int op)
6005 {
6006 	int		sync_flag = 0;
6007 	int		rv;
6008 	mdi_pathinfo_t 	*next;
6009 	int		(*f)() = NULL;
6010 
6011 	f = vh->vh_ops->vo_pi_state_change;
6012 
6013 	sync_flag = (flags << 8) & 0xf00;
6014 
6015 	/*
6016 	 * Do a callback into the mdi consumer to let it
6017 	 * know that path is about to get enabled/disabled.
6018 	 */
6019 	if (f != NULL) {
6020 		rv = (*f)(vh->vh_dip, pip, 0,
6021 			MDI_PI_EXT_STATE(pip),
6022 			MDI_EXT_STATE_CHANGE | sync_flag |
6023 			op | MDI_BEFORE_STATE_CHANGE);
6024 		if (rv != MDI_SUCCESS) {
6025 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
6026 			"!vo_pi_state_change: failed rv = %x", rv));
6027 		}
6028 	}
6029 	MDI_PI_LOCK(pip);
6030 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6031 
6032 	switch (flags) {
6033 		case USER_DISABLE:
6034 			if (op == MDI_DISABLE_OP) {
6035 				MDI_PI_SET_USER_DISABLE(pip);
6036 			} else {
6037 				MDI_PI_SET_USER_ENABLE(pip);
6038 			}
6039 			break;
6040 		case DRIVER_DISABLE:
6041 			if (op == MDI_DISABLE_OP) {
6042 				MDI_PI_SET_DRV_DISABLE(pip);
6043 			} else {
6044 				MDI_PI_SET_DRV_ENABLE(pip);
6045 			}
6046 			break;
6047 		case DRIVER_DISABLE_TRANSIENT:
6048 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
6049 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
6050 			} else {
6051 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
6052 			}
6053 			break;
6054 	}
6055 	MDI_PI_UNLOCK(pip);
6056 	/*
6057 	 * Do a callback into the mdi consumer to let it
6058 	 * know that path is now enabled/disabled.
6059 	 */
6060 	if (f != NULL) {
6061 		rv = (*f)(vh->vh_dip, pip, 0,
6062 			MDI_PI_EXT_STATE(pip),
6063 			MDI_EXT_STATE_CHANGE | sync_flag |
6064 			op | MDI_AFTER_STATE_CHANGE);
6065 		if (rv != MDI_SUCCESS) {
6066 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
6067 			"!vo_pi_state_change: failed rv = %x", rv));
6068 		}
6069 	}
6070 	return (next);
6071 }
6072 
6073 /*
6074  * Common routine for doing enable/disable.
6075  * NOTE: this will be removed once the NWS files are changed to use the new
6076  * mdi_{enable,disable}_path has been putback
6077  */
6078 int
6079 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
6080 {
6081 
6082 	mdi_phci_t	*ph;
6083 	mdi_vhci_t	*vh = NULL;
6084 	mdi_client_t	*ct;
6085 	mdi_pathinfo_t	*next, *pip;
6086 	int		found_it;
6087 
6088 	ph = i_devi_get_phci(pdip);
6089 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6090 		"Op = %d pdip = %p cdip = %p\n", op, (void *)pdip,
6091 		(void *)cdip));
6092 	if (ph == NULL) {
6093 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
6094 			"Op %d failed. ph = NULL\n", op));
6095 		return (MDI_FAILURE);
6096 	}
6097 
6098 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6099 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6100 			"Op Invalid operation = %d\n", op));
6101 		return (MDI_FAILURE);
6102 	}
6103 
6104 	vh = ph->ph_vhci;
6105 
6106 	if (cdip == NULL) {
6107 		/*
6108 		 * Need to mark the Phci as enabled/disabled.
6109 		 */
6110 		MDI_DEBUG(3, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6111 		"Op %d for the phci\n", op));
6112 		MDI_PHCI_LOCK(ph);
6113 		switch (flags) {
6114 			case USER_DISABLE:
6115 				if (op == MDI_DISABLE_OP) {
6116 					MDI_PHCI_SET_USER_DISABLE(ph);
6117 				} else {
6118 					MDI_PHCI_SET_USER_ENABLE(ph);
6119 				}
6120 				break;
6121 			case DRIVER_DISABLE:
6122 				if (op == MDI_DISABLE_OP) {
6123 					MDI_PHCI_SET_DRV_DISABLE(ph);
6124 				} else {
6125 					MDI_PHCI_SET_DRV_ENABLE(ph);
6126 				}
6127 				break;
6128 			case DRIVER_DISABLE_TRANSIENT:
6129 				if (op == MDI_DISABLE_OP) {
6130 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6131 				} else {
6132 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6133 				}
6134 				break;
6135 			default:
6136 				MDI_PHCI_UNLOCK(ph);
6137 				MDI_DEBUG(1, (CE_NOTE, NULL,
6138 				"!i_mdi_pi_enable_disable:"
6139 				" Invalid flag argument= %d\n", flags));
6140 		}
6141 
6142 		/*
6143 		 * Phci has been disabled. Now try to enable/disable
6144 		 * path info's to each client.
6145 		 */
6146 		pip = ph->ph_path_head;
6147 		while (pip != NULL) {
6148 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6149 		}
6150 		MDI_PHCI_UNLOCK(ph);
6151 	} else {
6152 
6153 		/*
6154 		 * Disable a specific client.
6155 		 */
6156 		ct = i_devi_get_client(cdip);
6157 		if (ct == NULL) {
6158 			MDI_DEBUG(1, (CE_NOTE, NULL,
6159 			"!i_mdi_pi_enable_disable:"
6160 			" failed. ct = NULL operation = %d\n", op));
6161 			return (MDI_FAILURE);
6162 		}
6163 
6164 		MDI_CLIENT_LOCK(ct);
6165 		pip = ct->ct_path_head;
6166 		found_it = 0;
6167 		while (pip != NULL) {
6168 			MDI_PI_LOCK(pip);
6169 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6170 			if (MDI_PI(pip)->pi_phci == ph) {
6171 				MDI_PI_UNLOCK(pip);
6172 				found_it = 1;
6173 				break;
6174 			}
6175 			MDI_PI_UNLOCK(pip);
6176 			pip = next;
6177 		}
6178 
6179 
6180 		MDI_CLIENT_UNLOCK(ct);
6181 		if (found_it == 0) {
6182 			MDI_DEBUG(1, (CE_NOTE, NULL,
6183 			"!i_mdi_pi_enable_disable:"
6184 			" failed. Could not find corresponding pip\n"));
6185 			return (MDI_FAILURE);
6186 		}
6187 
6188 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6189 	}
6190 
6191 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6192 		"Op %d Returning success pdip = %p cdip = %p\n",
6193 		op, (void *)pdip, (void *)cdip));
6194 	return (MDI_SUCCESS);
6195 }
6196 
6197 /*
6198  * Ensure phci powered up
6199  */
6200 static void
6201 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
6202 {
6203 	dev_info_t	*ph_dip;
6204 
6205 	ASSERT(pip != NULL);
6206 	ASSERT(MDI_PI_LOCKED(pip));
6207 
6208 	if (MDI_PI(pip)->pi_pm_held) {
6209 		return;
6210 	}
6211 
6212 	ph_dip = mdi_pi_get_phci(pip);
6213 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_hold_pip for %s%d %p\n",
6214 	    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
6215 	if (ph_dip == NULL) {
6216 		return;
6217 	}
6218 
6219 	MDI_PI_UNLOCK(pip);
6220 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
6221 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6222 
6223 	pm_hold_power(ph_dip);
6224 
6225 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
6226 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6227 	MDI_PI_LOCK(pip);
6228 
6229 	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
6230 	if (DEVI(ph_dip)->devi_pm_info)
6231 		MDI_PI(pip)->pi_pm_held = 1;
6232 }
6233 
6234 /*
6235  * Allow phci powered down
6236  */
6237 static void
6238 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
6239 {
6240 	dev_info_t	*ph_dip = NULL;
6241 
6242 	ASSERT(pip != NULL);
6243 	ASSERT(MDI_PI_LOCKED(pip));
6244 
6245 	if (MDI_PI(pip)->pi_pm_held == 0) {
6246 		return;
6247 	}
6248 
6249 	ph_dip = mdi_pi_get_phci(pip);
6250 	ASSERT(ph_dip != NULL);
6251 
6252 	MDI_PI_UNLOCK(pip);
6253 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_rele_pip for %s%d %p\n",
6254 	    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
6255 
6256 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
6257 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6258 	pm_rele_power(ph_dip);
6259 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
6260 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6261 
6262 	MDI_PI_LOCK(pip);
6263 	MDI_PI(pip)->pi_pm_held = 0;
6264 }
6265 
6266 static void
6267 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
6268 {
6269 	ASSERT(MDI_CLIENT_LOCKED(ct));
6270 
6271 	ct->ct_power_cnt += incr;
6272 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_hold_client %p "
6273 	    "ct_power_cnt = %d incr = %d\n", (void *)ct,
6274 	    ct->ct_power_cnt, incr));
6275 	ASSERT(ct->ct_power_cnt >= 0);
6276 }
6277 
6278 static void
6279 i_mdi_rele_all_phci(mdi_client_t *ct)
6280 {
6281 	mdi_pathinfo_t  *pip;
6282 
6283 	ASSERT(MDI_CLIENT_LOCKED(ct));
6284 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6285 	while (pip != NULL) {
6286 		mdi_hold_path(pip);
6287 		MDI_PI_LOCK(pip);
6288 		i_mdi_pm_rele_pip(pip);
6289 		MDI_PI_UNLOCK(pip);
6290 		mdi_rele_path(pip);
6291 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6292 	}
6293 }
6294 
6295 static void
6296 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6297 {
6298 	ASSERT(MDI_CLIENT_LOCKED(ct));
6299 
6300 	if (i_ddi_devi_attached(ct->ct_dip)) {
6301 		ct->ct_power_cnt -= decr;
6302 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_rele_client %p "
6303 		    "ct_power_cnt = %d decr = %d\n",
6304 		    (void *)ct, ct->ct_power_cnt, decr));
6305 	}
6306 
6307 	ASSERT(ct->ct_power_cnt >= 0);
6308 	if (ct->ct_power_cnt == 0) {
6309 		i_mdi_rele_all_phci(ct);
6310 		return;
6311 	}
6312 }
6313 
6314 static void
6315 i_mdi_pm_reset_client(mdi_client_t *ct)
6316 {
6317 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_reset_client %p "
6318 	    "ct_power_cnt = %d\n", (void *)ct, ct->ct_power_cnt));
6319 	ASSERT(MDI_CLIENT_LOCKED(ct));
6320 	ct->ct_power_cnt = 0;
6321 	i_mdi_rele_all_phci(ct);
6322 	ct->ct_powercnt_config = 0;
6323 	ct->ct_powercnt_unconfig = 0;
6324 	ct->ct_powercnt_reset = 1;
6325 }
6326 
6327 static int
6328 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6329 {
6330 	int		ret;
6331 	dev_info_t	*ph_dip;
6332 
6333 	MDI_PI_LOCK(pip);
6334 	i_mdi_pm_hold_pip(pip);
6335 
6336 	ph_dip = mdi_pi_get_phci(pip);
6337 	MDI_PI_UNLOCK(pip);
6338 
6339 	/* bring all components of phci to full power */
6340 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
6341 	    "pm_powerup for %s%d %p\n", ddi_driver_name(ph_dip),
6342 	    ddi_get_instance(ph_dip), (void *)pip));
6343 
6344 	ret = pm_powerup(ph_dip);
6345 
6346 	if (ret == DDI_FAILURE) {
6347 		MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
6348 		    "pm_powerup FAILED for %s%d %p\n",
6349 		    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip),
6350 		    (void *)pip));
6351 
6352 		MDI_PI_LOCK(pip);
6353 		i_mdi_pm_rele_pip(pip);
6354 		MDI_PI_UNLOCK(pip);
6355 		return (MDI_FAILURE);
6356 	}
6357 
6358 	return (MDI_SUCCESS);
6359 }
6360 
6361 static int
6362 i_mdi_power_all_phci(mdi_client_t *ct)
6363 {
6364 	mdi_pathinfo_t  *pip;
6365 	int		succeeded = 0;
6366 
6367 	ASSERT(MDI_CLIENT_LOCKED(ct));
6368 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6369 	while (pip != NULL) {
6370 		/*
6371 		 * Don't power if MDI_PATHINFO_STATE_FAULT
6372 		 * or MDI_PATHINFO_STATE_OFFLINE.
6373 		 */
6374 		if (MDI_PI_IS_INIT(pip) ||
6375 		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
6376 			mdi_hold_path(pip);
6377 			MDI_CLIENT_UNLOCK(ct);
6378 			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
6379 				succeeded = 1;
6380 
6381 			ASSERT(ct == MDI_PI(pip)->pi_client);
6382 			MDI_CLIENT_LOCK(ct);
6383 			mdi_rele_path(pip);
6384 		}
6385 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6386 	}
6387 
6388 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
6389 }
6390 
/*
 * mdi_bus_power():
 *		1. Place the phci(s) into powered up state so that
 *		   client can do power management
 *		2. Ensure phci powered up as client power managing
 *
 *		BUS_POWER_NOINVOL is rejected.  Only the pre/post
 *		notifications and BUS_POWER_HAS_CHANGED are processed here;
 *		every other op is handed to pm_busop_bus_power() and its
 *		return value is passed back.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
int
mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
    void *arg, void *result)
{
	int			ret = MDI_SUCCESS;
	pm_bp_child_pwrchg_t	*bpc;	/* valid for PRE/POST notification */
	mdi_client_t		*ct;
	dev_info_t		*cdip;
	pm_bp_has_changed_t	*bphc;	/* valid for HAS_CHANGED only */

	/*
	 * BUS_POWER_NOINVOL not supported
	 */
	if (op == BUS_POWER_NOINVOL)
		return (MDI_FAILURE);

	/*
	 * Pick the child dip out of the op-specific argument.  Ops not
	 * listed here are ignored: pass them on and return quickly to
	 * save CPU cycles on the ct processing.
	 */
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
	case BUS_POWER_POST_NOTIFICATION:
		bpc = (pm_bp_child_pwrchg_t *)arg;
		cdip = bpc->bpc_dip;
		break;
	case BUS_POWER_HAS_CHANGED:
		bphc = (pm_bp_has_changed_t *)arg;
		cdip = bphc->bphc_dip;
		break;
	default:
		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
	}

	ASSERT(MDI_CLIENT(cdip));

	ct = i_devi_get_client(cdip);
	if (ct == NULL)
		return (MDI_FAILURE);

	/*
	 * wait till the mdi_pathinfo node state change are processed
	 */
	MDI_CLIENT_LOCK(ct);
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
		    "BUS_POWER_PRE_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));

		/* serialize power level change per client */
		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

		/* Cleared again in the matching post-notification. */
		MDI_CLIENT_SET_POWER_TRANSITION(ct);

		/* No holds outstanding yet: power up the pHCIs first. */
		if (ct->ct_power_cnt == 0) {
			ret = i_mdi_power_all_phci(ct);
		}

		/*
		 * if new_level > 0:
		 *	- hold phci(s)
		 *	- power up phci(s) if not already
		 * ignore power down
		 */
		if (bpc->bpc_nlevel > 0) {
			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		}
		break;
	case BUS_POWER_POST_NOTIFICATION:
		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
		    "BUS_POWER_POST_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d\n",
		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
		    *(int *)result));

		/* Record the new power state only if the change worked. */
		if (*(int *)result == DDI_SUCCESS) {
			if (bpc->bpc_nlevel > 0) {
				MDI_CLIENT_SET_POWER_UP(ct);
			} else {
				MDI_CLIENT_SET_POWER_DOWN(ct);
			}
		}

		/* release the hold we did in pre-notification */
		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
			MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
			    "mdi_bus_power i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}

		/* Successful power down: give up the client's holds. */
		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
			/* another thread might started attaching */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			/* detaching has been taken care in pm_post_unconfig */
			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		/* End of the transition: wake anybody waiting on it. */
		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
		cv_broadcast(&ct->ct_powerchange_cv);

		break;

	/* need to do more */
	case BUS_POWER_HAS_CHANGED:
		MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip, "mdi_bus_power "
		    "BUS_POWER_HAS_CHANGED:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
		    PM_NAME(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));

		/* Level went up: power the pHCIs if needed and take holds. */
		if (bphc->bphc_nlevel > 0 &&
		    bphc->bphc_nlevel > bphc->bphc_olevel) {
			if (ct->ct_power_cnt == 0) {
				ret = i_mdi_power_all_phci(ct);
			}
			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
			    "mdi_bus_power i_mdi_pm_hold_client\n"));
			i_mdi_pm_hold_client(ct, ct->ct_path_count);
		}

		/* Level dropped to 0 from a level other than -1. */
		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
			    "mdi_bus_power i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}
		break;
	}

	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}
6548 
6549 static int
6550 i_mdi_pm_pre_config_one(dev_info_t *child)
6551 {
6552 	int		ret = MDI_SUCCESS;
6553 	mdi_client_t	*ct;
6554 
6555 	ct = i_devi_get_client(child);
6556 	if (ct == NULL)
6557 		return (MDI_FAILURE);
6558 
6559 	MDI_CLIENT_LOCK(ct);
6560 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6561 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6562 
6563 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6564 		MDI_CLIENT_UNLOCK(ct);
6565 		MDI_DEBUG(4, (CE_NOTE, child,
6566 		    "i_mdi_pm_pre_config_one already configured\n"));
6567 		return (MDI_SUCCESS);
6568 	}
6569 
6570 	if (ct->ct_powercnt_config) {
6571 		MDI_CLIENT_UNLOCK(ct);
6572 		MDI_DEBUG(4, (CE_NOTE, child,
6573 		    "i_mdi_pm_pre_config_one ALREADY held\n"));
6574 		return (MDI_SUCCESS);
6575 	}
6576 
6577 	if (ct->ct_power_cnt == 0) {
6578 		ret = i_mdi_power_all_phci(ct);
6579 	}
6580 	MDI_DEBUG(4, (CE_NOTE, child,
6581 	    "i_mdi_pm_pre_config_one i_mdi_pm_hold_client\n"));
6582 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6583 	ct->ct_powercnt_config = 1;
6584 	ct->ct_powercnt_reset = 0;
6585 	MDI_CLIENT_UNLOCK(ct);
6586 	return (ret);
6587 }
6588 
6589 static int
6590 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6591 {
6592 	int			ret = MDI_SUCCESS;
6593 	dev_info_t		*cdip;
6594 	int			circ;
6595 
6596 	ASSERT(MDI_VHCI(vdip));
6597 
6598 	/* ndi_devi_config_one */
6599 	if (child) {
6600 		ASSERT(DEVI_BUSY_OWNED(vdip));
6601 		return (i_mdi_pm_pre_config_one(child));
6602 	}
6603 
6604 	/* devi_config_common */
6605 	ndi_devi_enter(vdip, &circ);
6606 	cdip = ddi_get_child(vdip);
6607 	while (cdip) {
6608 		dev_info_t *next = ddi_get_next_sibling(cdip);
6609 
6610 		ret = i_mdi_pm_pre_config_one(cdip);
6611 		if (ret != MDI_SUCCESS)
6612 			break;
6613 		cdip = next;
6614 	}
6615 	ndi_devi_exit(vdip, circ);
6616 	return (ret);
6617 }
6618 
/*
 * i_mdi_pm_pre_unconfig_one():
 *		Take the power hold needed before a single client node is
 *		unconfigured.
 *
 *		child	client dev_info node about to be unconfigured
 *		held	set to 1 when the client's unconfig power hold is in
 *			place on return; this function never clears it
 *		flags	NDI flags; only NDI_AUTODETACH is examined here
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	child has no client, the client is powered
 *				down during an auto-detach, or
 *				i_mdi_power_all_phci() failed
 */
static int
i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
{
	int		ret = MDI_SUCCESS;
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return (MDI_FAILURE);

	MDI_CLIENT_LOCK(ct);
	/* wait until no power transition is in progress on this client */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* a node that is not attached needs no hold; *held is left alone */
	if (!i_ddi_devi_attached(ct->ct_dip)) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig node detached already\n"));
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_SUCCESS);
	}

	/* refuse an auto-detach of a client that is already powered down */
	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    (flags & NDI_AUTODETACH)) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig auto-modunload\n"));
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_FAILURE);
	}

	/* an earlier pre-unconfig already took the hold; just report it */
	if (ct->ct_powercnt_unconfig) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig ct_powercnt_held\n"));
		MDI_CLIENT_UNLOCK(ct);
		*held = 1;
		return (MDI_SUCCESS);
	}

	if (ct->ct_power_cnt == 0) {
		ret = i_mdi_power_all_phci(ct);
	}
	MDI_DEBUG(4, (CE_NOTE, child,
	    "i_mdi_pm_pre_unconfig i_mdi_pm_hold_client\n"));
	/*
	 * NOTE(review): the hold is taken and ct_powercnt_unconfig is set
	 * even when i_mdi_power_all_phci() failed, but *held is only set on
	 * success -- confirm that this asymmetry is intended.
	 */
	i_mdi_pm_hold_client(ct, ct->ct_path_count);
	ct->ct_powercnt_unconfig = 1;
	ct->ct_powercnt_reset = 0;
	MDI_CLIENT_UNLOCK(ct);
	if (ret == MDI_SUCCESS)
		*held = 1;
	return (ret);
}
6669 
6670 static int
6671 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6672     int flags)
6673 {
6674 	int			ret = MDI_SUCCESS;
6675 	dev_info_t		*cdip;
6676 	int			circ;
6677 
6678 	ASSERT(MDI_VHCI(vdip));
6679 	*held = 0;
6680 
6681 	/* ndi_devi_unconfig_one */
6682 	if (child) {
6683 		ASSERT(DEVI_BUSY_OWNED(vdip));
6684 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6685 	}
6686 
6687 	/* devi_unconfig_common */
6688 	ndi_devi_enter(vdip, &circ);
6689 	cdip = ddi_get_child(vdip);
6690 	while (cdip) {
6691 		dev_info_t *next = ddi_get_next_sibling(cdip);
6692 
6693 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6694 		cdip = next;
6695 	}
6696 	ndi_devi_exit(vdip, circ);
6697 
6698 	if (*held)
6699 		ret = MDI_SUCCESS;
6700 
6701 	return (ret);
6702 }
6703 
/*
 * i_mdi_pm_post_config_one():
 *		Undo the power hold taken for the configuration of one client
 *		node.  Nothing is done when the child has no client, when the
 *		hold was already reset, when no config hold is recorded, or
 *		when the client is in the failed state.  Otherwise the hold is
 *		either reset or released and ct_powercnt_config is cleared.
 */
static void
i_mdi_pm_post_config_one(dev_info_t *child)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return;

	MDI_CLIENT_LOCK(ct);
	/* wait until no power transition is in progress on this client */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* no config hold to undo, or it has been reset in the meantime */
	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config_one NOT configured\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* client has not been updated */
	if (MDI_CLIENT_IS_FAILED(ct)) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config_one NOT configured\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* another thread might have powered it down or detached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
	    (!i_ddi_devi_attached(ct->ct_dip) &&
	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(ct);
	} else {
		mdi_pathinfo_t  *pip, *next;
		int	valid_path_count = 0;

		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config i_mdi_pm_rele_client\n"));
		/* count the paths that are currently online or standby */
		pip = ct->ct_path_head;
		while (pip != NULL) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
				valid_path_count ++;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		/* release with the online/standby count, not ct_path_count */
		i_mdi_pm_rele_client(ct, valid_path_count);
	}
	/* the config hold is gone on both the reset and the release path */
	ct->ct_powercnt_config = 0;
	MDI_CLIENT_UNLOCK(ct);
}
6760 
6761 static void
6762 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6763 {
6764 	int		circ;
6765 	dev_info_t	*cdip;
6766 
6767 	ASSERT(MDI_VHCI(vdip));
6768 
6769 	/* ndi_devi_config_one */
6770 	if (child) {
6771 		ASSERT(DEVI_BUSY_OWNED(vdip));
6772 		i_mdi_pm_post_config_one(child);
6773 		return;
6774 	}
6775 
6776 	/* devi_config_common */
6777 	ndi_devi_enter(vdip, &circ);
6778 	cdip = ddi_get_child(vdip);
6779 	while (cdip) {
6780 		dev_info_t *next = ddi_get_next_sibling(cdip);
6781 
6782 		i_mdi_pm_post_config_one(cdip);
6783 		cdip = next;
6784 	}
6785 	ndi_devi_exit(vdip, circ);
6786 }
6787 
/*
 * i_mdi_pm_post_unconfig_one():
 *		Undo the power hold taken by the pre-unconfig step for one
 *		client node.  Nothing is done when the child has no client,
 *		when no unconfig hold is recorded, or when the hold has been
 *		reset.  Otherwise the hold is either reset or released.
 */
static void
i_mdi_pm_post_unconfig_one(dev_info_t *child)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return;

	MDI_CLIENT_LOCK(ct);
	/* wait until no power transition is in progress on this client */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* no unconfig hold to undo, or it has been reset in the meantime */
	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_unconfig NOT held\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* failure detaching or another thread just attached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    i_ddi_devi_attached(ct->ct_dip)) ||
	    (!i_ddi_devi_attached(ct->ct_dip) &&
	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_unconfig i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(ct);
	} else {
		mdi_pathinfo_t  *pip, *next;
		int	valid_path_count = 0;

		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_unconfig i_mdi_pm_rele_client\n"));
		/* count the paths that are currently online or standby */
		pip = ct->ct_path_head;
		while (pip != NULL) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
				valid_path_count ++;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		i_mdi_pm_rele_client(ct, valid_path_count);
		/*
		 * ct_powercnt_unconfig is cleared here only; the reset branch
		 * above does not touch it in this function.
		 */
		ct->ct_powercnt_unconfig = 0;
	}

	MDI_CLIENT_UNLOCK(ct);
}
6837 
6838 static void
6839 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
6840 {
6841 	int			circ;
6842 	dev_info_t		*cdip;
6843 
6844 	ASSERT(MDI_VHCI(vdip));
6845 
6846 	if (!held) {
6847 		MDI_DEBUG(4, (CE_NOTE, vdip,
6848 		    "i_mdi_pm_post_unconfig held = %d\n", held));
6849 		return;
6850 	}
6851 
6852 	if (child) {
6853 		ASSERT(DEVI_BUSY_OWNED(vdip));
6854 		i_mdi_pm_post_unconfig_one(child);
6855 		return;
6856 	}
6857 
6858 	ndi_devi_enter(vdip, &circ);
6859 	cdip = ddi_get_child(vdip);
6860 	while (cdip) {
6861 		dev_info_t *next = ddi_get_next_sibling(cdip);
6862 
6863 		i_mdi_pm_post_unconfig_one(cdip);
6864 		cdip = next;
6865 	}
6866 	ndi_devi_exit(vdip, circ);
6867 }
6868 
/*
 * mdi_power():
 *		Dispatch a power management operation for the clients of a
 *		vHCI.
 *
 *		vdip	vHCI dev_info node
 *		op	operation to perform (mdi_pm_op_t)
 *		args	operation specific:
 *			MDI_PM_PRE_UNCONFIG	int *, receives the "held" flag
 *			MDI_PM_POST_UNCONFIG	int *, the "held" flag is read
 *			MDI_PM_HOLD_POWER,
 *			MDI_PM_RELE_POWER	client dev_info_t *
 *			unused for the other operations
 *		devnm	child name to look up under vdip, or NULL to operate
 *			on all children (config/unconfig operations)
 *		flags	passed through to the pre-unconfig step only
 *
 * Return Values:
 *		Result of the pre-config or pre-unconfig step for those two
 *		operations; MDI_SUCCESS for every other operation, including
 *		unrecognized ones.
 */
int
mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
{
	int			circ, ret = MDI_SUCCESS;
	dev_info_t		*client_dip = NULL;
	mdi_client_t		*ct;

	/*
	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
	 * Power up pHCI for the named client device.
	 * Note: Before the client is enumerated under vhci by phci,
	 * client_dip can be NULL. Then proceed to power up all the
	 * pHCIs.
	 */
	if (devnm != NULL) {
		/* stays entered until the matching exit at the bottom */
		ndi_devi_enter(vdip, &circ);
		client_dip = ndi_devi_findchild(vdip, devnm);
	}

	MDI_DEBUG(4, (CE_NOTE, vdip, "mdi_power op = %d %s %p\n",
	    op, devnm ? devnm : "NULL", (void *)client_dip));

	switch (op) {
	case MDI_PM_PRE_CONFIG:
		ret = i_mdi_pm_pre_config(vdip, client_dip);
		break;

	case MDI_PM_PRE_UNCONFIG:
		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
		    flags);
		break;

	case MDI_PM_POST_CONFIG:
		i_mdi_pm_post_config(vdip, client_dip);
		break;

	case MDI_PM_POST_UNCONFIG:
		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
		break;

	case MDI_PM_HOLD_POWER:
	case MDI_PM_RELE_POWER:
		ASSERT(args);

		/* the client comes from args here, not from the devnm lookup */
		client_dip = (dev_info_t *)args;
		ASSERT(MDI_CLIENT(client_dip));

		ct = i_devi_get_client(client_dip);
		MDI_CLIENT_LOCK(ct);

		if (op == MDI_PM_HOLD_POWER) {
			/* only the first hold powers the pHCIs and counts */
			if (ct->ct_power_cnt == 0) {
				(void) i_mdi_power_all_phci(ct);
				MDI_DEBUG(4, (CE_NOTE, client_dip,
				    "mdi_power i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		} else {
			/* release while attaching, otherwise reset the hold */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, client_dip,
				    "mdi_power i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			} else {
				MDI_DEBUG(4, (CE_NOTE, client_dip,
				    "mdi_power i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		MDI_CLIENT_UNLOCK(ct);
		break;

	default:
		break;
	}

	/* pairs with the ndi_devi_enter() done for the devnm lookup */
	if (devnm)
		ndi_devi_exit(vdip, circ);

	return (ret);
}
6950 
6951 int
6952 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
6953 {
6954 	mdi_vhci_t *vhci;
6955 
6956 	if (!MDI_VHCI(dip))
6957 		return (MDI_FAILURE);
6958 
6959 	if (mdi_class) {
6960 		vhci = DEVI(dip)->devi_mdi_xhci;
6961 		ASSERT(vhci);
6962 		*mdi_class = vhci->vh_class;
6963 	}
6964 
6965 	return (MDI_SUCCESS);
6966 }
6967 
6968 int
6969 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
6970 {
6971 	mdi_phci_t *phci;
6972 
6973 	if (!MDI_PHCI(dip))
6974 		return (MDI_FAILURE);
6975 
6976 	if (mdi_class) {
6977 		phci = DEVI(dip)->devi_mdi_xhci;
6978 		ASSERT(phci);
6979 		*mdi_class = phci->ph_vhci->vh_class;
6980 	}
6981 
6982 	return (MDI_SUCCESS);
6983 }
6984 
6985 int
6986 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
6987 {
6988 	mdi_client_t *client;
6989 
6990 	if (!MDI_CLIENT(dip))
6991 		return (MDI_FAILURE);
6992 
6993 	if (mdi_class) {
6994 		client = DEVI(dip)->devi_mdi_client;
6995 		ASSERT(client);
6996 		*mdi_class = client->ct_vhci->vh_class;
6997 	}
6998 
6999 	return (MDI_SUCCESS);
7000 }
7001 
7002 void *
7003 mdi_client_get_vhci_private(dev_info_t *dip)
7004 {
7005 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7006 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7007 		mdi_client_t	*ct;
7008 		ct = i_devi_get_client(dip);
7009 		return (ct->ct_vprivate);
7010 	}
7011 	return (NULL);
7012 }
7013 
7014 void
7015 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
7016 {
7017 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7018 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7019 		mdi_client_t	*ct;
7020 		ct = i_devi_get_client(dip);
7021 		ct->ct_vprivate = data;
7022 	}
7023 }
7024 /*
7025  * mdi_pi_get_vhci_private():
7026  *		Get the vhci private information associated with the
7027  *		mdi_pathinfo node
7028  */
7029 void *
7030 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
7031 {
7032 	caddr_t	vprivate = NULL;
7033 	if (pip) {
7034 		vprivate = MDI_PI(pip)->pi_vprivate;
7035 	}
7036 	return (vprivate);
7037 }
7038 
7039 /*
7040  * mdi_pi_set_vhci_private():
7041  *		Set the vhci private information in the mdi_pathinfo node
7042  */
7043 void
7044 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
7045 {
7046 	if (pip) {
7047 		MDI_PI(pip)->pi_vprivate = priv;
7048 	}
7049 }
7050 
7051 /*
7052  * mdi_phci_get_vhci_private():
7053  *		Get the vhci private information associated with the
7054  *		mdi_phci node
7055  */
7056 void *
7057 mdi_phci_get_vhci_private(dev_info_t *dip)
7058 {
7059 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7060 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7061 		mdi_phci_t	*ph;
7062 		ph = i_devi_get_phci(dip);
7063 		return (ph->ph_vprivate);
7064 	}
7065 	return (NULL);
7066 }
7067 
7068 /*
7069  * mdi_phci_set_vhci_private():
7070  *		Set the vhci private information in the mdi_phci node
7071  */
7072 void
7073 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
7074 {
7075 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7076 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7077 		mdi_phci_t	*ph;
7078 		ph = i_devi_get_phci(dip);
7079 		ph->ph_vprivate = priv;
7080 	}
7081 }
7082 
/*
 * List of vhci class names:
 * A vhci class name must be in this list only if the corresponding vhci
 * driver intends to use the mdi provided bus config implementation
 * (i.e., mdi_vhci_bus_config()).
 */
static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
/* number of entries in vhci_class_list[] */
#define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))

/*
 * During boot time, the on-disk vhci cache for every vhci class is read
 * in the form of an nvlist and stored here.  The slots are indexed in
 * parallel with vhci_class_list[].
 */
static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];

/*
 * nvpair names in vhci cache nvlist, and the only cache format version
 * that this code accepts
 */
#define	MDI_VHCI_CACHE_VERSION	1
#define	MDI_NVPNAME_VERSION	"version"
#define	MDI_NVPNAME_PHCIS	"phcis"
#define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
7103 
7104 /*
7105  * Given vhci class name, return its on-disk vhci cache filename.
7106  * Memory for the returned filename which includes the full path is allocated
7107  * by this function.
7108  */
7109 static char *
7110 vhclass2vhcache_filename(char *vhclass)
7111 {
7112 	char *filename;
7113 	int len;
7114 	static char *fmt = "/etc/devices/mdi_%s_cache";
7115 
7116 	/*
7117 	 * fmt contains the on-disk vhci cache file name format;
7118 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
7119 	 */
7120 
7121 	/* the -1 below is to account for "%s" in the format string */
7122 	len = strlen(fmt) + strlen(vhclass) - 1;
7123 	filename = kmem_alloc(len, KM_SLEEP);
7124 	(void) snprintf(filename, len, fmt, vhclass);
7125 	ASSERT(len == (strlen(filename) + 1));
7126 	return (filename);
7127 }
7128 
/*
 * initialize the vhci cache related data structures and read the on-disk
 * vhci cached data into memory.
 *
 * Allocates the mdi_vhci_config_t for vh and stores it in vh->vh_config,
 * creates the client hash, loads the cache nvlist (either the copy saved
 * at boot in vhcache_nvl[] or a fresh read from disk) and registers the
 * stop_vhcache_flush_thread() callback.
 */
static void
setup_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc;
	mdi_vhci_cache_t *vhcache;
	int i;
	nvlist_t *nvl = NULL;

	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
	vh->vh_config = vhc;
	vhcache = &vhc->vhc_vhcache;

	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);

	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);

	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);

	/*
	 * Create string hash; same as mod_hash_create_strhash() except that
	 * we use NULL key destructor.
	 */
	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
	    mdi_bus_config_cache_hash_size,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);

	/*
	 * The on-disk vhci cache is read during booting prior to the
	 * lights-out period by mdi_read_devices_files().
	 */
	for (i = 0; i < N_VHCI_CLASSES; i++) {
		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
			/* take ownership of the saved nvlist; clear the slot */
			nvl = vhcache_nvl[i];
			vhcache_nvl[i] = NULL;
			break;
		}
	}

	/*
	 * this is to cover the case of some one manually causing unloading
	 * (or detaching) and reloading (or attaching) of a vhci driver.
	 */
	if (nvl == NULL && modrootloaded)
		nvl = read_on_disk_vhci_cache(vh->vh_class);

	if (nvl != NULL) {
		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
		else  {
			cmn_err(CE_WARN,
			    "%s: data file corrupted, will recreate\n",
			    vhc->vhc_vhcache_filename);
		}
		rw_exit(&vhcache->vhcache_lock);
		/* the nvlist is freed whether or not the conversion worked */
		nvlist_free(nvl);
	}

	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");

	/* snapshot the path discovery tunables into this vhci's config */
	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
}
7199 
/*
 * free all vhci cache related resources
 *
 * Returns MDI_FAILURE, with nothing freed, when the async threads cannot
 * be stopped; MDI_SUCCESS once the filename, the client hash, every cached
 * phci/client/pathinfo entry, the locks and the mdi_vhci_config_t itself
 * have been released.
 */
static int
destroy_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_phci_t *cphci, *cphci_next;
	mdi_vhcache_client_t *cct, *cct_next;
	mdi_vhcache_pathinfo_t *cpi, *cpi_next;

	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
		return (MDI_FAILURE);

	kmem_free(vhc->vhc_vhcache_filename,
	    strlen(vhc->vhc_vhcache_filename) + 1);

	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);

	/* each next pointer is saved before its node is freed */
	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci_next) {
		cphci_next = cphci->cphci_next;
		free_vhcache_phci(cphci);
	}

	/* free every client's pathinfo list, then the client itself */
	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
		cct_next = cct->cct_next;
		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
			cpi_next = cpi->cpi_next;
			free_vhcache_pathinfo(cpi);
		}
		free_vhcache_client(cct);
	}

	rw_destroy(&vhcache->vhcache_lock);

	mutex_destroy(&vhc->vhc_lock);
	cv_destroy(&vhc->vhc_cv);
	/* NOTE(review): vh->vh_config is not cleared here; confirm caller */
	kmem_free(vhc, sizeof (mdi_vhci_config_t));
	return (MDI_SUCCESS);
}
7242 
/*
 * Stop all vhci cache related async threads and free their resources.
 *
 * Sets MDI_VHC_EXIT, waits for the flush thread flag to clear and for
 * vhc_acc_thrcount to reach zero, frees the queued async client config
 * requests, flushes the cache if it is dirty and deletes the callback
 * registered in vhc_cbid.
 *
 * Returns MDI_SUCCESS, or MDI_FAILURE when the flush or callb_delete()
 * fails.
 */
static int
stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
{
	mdi_async_client_config_t *acc, *acc_next;

	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_flags |= MDI_VHC_EXIT;
	ASSERT(vhc->vhc_acc_thrcount >= 0);
	cv_broadcast(&vhc->vhc_cv);

	/* poll with vhc_lock dropped around each delay() */
	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
	    vhc->vhc_acc_thrcount != 0) {
		mutex_exit(&vhc->vhc_lock);
		delay(1);
		mutex_enter(&vhc->vhc_lock);
	}

	vhc->vhc_flags &= ~MDI_VHC_EXIT;

	/* discard whatever requests are still queued */
	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
		acc_next = acc->acc_next;
		free_async_client_config(acc);
	}
	vhc->vhc_acc_list_head = NULL;
	vhc->vhc_acc_list_tail = NULL;
	vhc->vhc_acc_count = 0;

	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
		/* the flush itself runs without vhc_lock held */
		mutex_exit(&vhc->vhc_lock);
		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
			/* mark the cache dirty again; callback stays added */
			vhcache_dirty(vhc);
			return (MDI_FAILURE);
		}
	} else
		mutex_exit(&vhc->vhc_lock);

	if (callb_delete(vhc->vhc_cbid) != 0)
		return (MDI_FAILURE);

	return (MDI_SUCCESS);
}
7288 
/*
 * Stop vhci cache flush thread
 *
 * Callback registered through callb_add() in setup_vhci_cache(); arg is
 * the mdi_vhci_config_t and code is unused.  Sets MDI_VHC_EXIT (which is
 * not cleared again here), waits for the flush thread flag to clear and
 * then flushes the cache once if it is dirty.  Always returns B_TRUE.
 */
/* ARGSUSED */
static boolean_t
stop_vhcache_flush_thread(void *arg, int code)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;

	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_flags |= MDI_VHC_EXIT;
	cv_broadcast(&vhc->vhc_cv);

	/* poll with vhc_lock dropped around each delay() */
	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
		mutex_exit(&vhc->vhc_lock);
		delay(1);
		mutex_enter(&vhc->vhc_lock);
	}

	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
		mutex_exit(&vhc->vhc_lock);
		/* best effort: the flush result is deliberately ignored */
		(void) flush_vhcache(vhc, 1);
	} else
		mutex_exit(&vhc->vhc_lock);

	return (B_TRUE);
}
7317 
7318 /*
7319  * Enqueue the vhcache phci (cphci) at the tail of the list
7320  */
7321 static void
7322 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
7323 {
7324 	cphci->cphci_next = NULL;
7325 	if (vhcache->vhcache_phci_head == NULL)
7326 		vhcache->vhcache_phci_head = cphci;
7327 	else
7328 		vhcache->vhcache_phci_tail->cphci_next = cphci;
7329 	vhcache->vhcache_phci_tail = cphci;
7330 }
7331 
7332 /*
7333  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
7334  */
7335 static void
7336 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7337     mdi_vhcache_pathinfo_t *cpi)
7338 {
7339 	cpi->cpi_next = NULL;
7340 	if (cct->cct_cpi_head == NULL)
7341 		cct->cct_cpi_head = cpi;
7342 	else
7343 		cct->cct_cpi_tail->cpi_next = cpi;
7344 	cct->cct_cpi_tail = cpi;
7345 }
7346 
7347 /*
7348  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
7349  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7350  * flag set come at the beginning of the list. All cpis which have this
7351  * flag set come at the end of the list.
7352  */
7353 static void
7354 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7355     mdi_vhcache_pathinfo_t *newcpi)
7356 {
7357 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
7358 
7359 	if (cct->cct_cpi_head == NULL ||
7360 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
7361 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
7362 	else {
7363 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
7364 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
7365 		    prev_cpi = cpi, cpi = cpi->cpi_next)
7366 			;
7367 
7368 		if (prev_cpi == NULL)
7369 			cct->cct_cpi_head = newcpi;
7370 		else
7371 			prev_cpi->cpi_next = newcpi;
7372 
7373 		newcpi->cpi_next = cpi;
7374 
7375 		if (cpi == NULL)
7376 			cct->cct_cpi_tail = newcpi;
7377 	}
7378 }
7379 
7380 /*
7381  * Enqueue the vhcache client (cct) at the tail of the list
7382  */
7383 static void
7384 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
7385     mdi_vhcache_client_t *cct)
7386 {
7387 	cct->cct_next = NULL;
7388 	if (vhcache->vhcache_client_head == NULL)
7389 		vhcache->vhcache_client_head = cct;
7390 	else
7391 		vhcache->vhcache_client_tail->cct_next = cct;
7392 	vhcache->vhcache_client_tail = cct;
7393 }
7394 
/*
 * Free an array of nelem strings together with the array itself.  Each
 * non-NULL element is freed as strlen() + 1 bytes.  A NULL array is
 * ignored.
 */
static void
free_string_array(char **str, int nelem)
{
	int idx;

	if (str == NULL)
		return;

	for (idx = 0; idx < nelem; idx++) {
		if (str[idx] != NULL)
			kmem_free(str[idx], strlen(str[idx]) + 1);
	}
	kmem_free(str, sizeof (char *) * nelem);
}
7408 
7409 static void
7410 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
7411 {
7412 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
7413 	kmem_free(cphci, sizeof (*cphci));
7414 }
7415 
7416 static void
7417 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
7418 {
7419 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
7420 	kmem_free(cpi, sizeof (*cpi));
7421 }
7422 
7423 static void
7424 free_vhcache_client(mdi_vhcache_client_t *cct)
7425 {
7426 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
7427 	kmem_free(cct, sizeof (*cct));
7428 }
7429 
7430 static char *
7431 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7432 {
7433 	char *name_addr;
7434 	int len;
7435 
7436 	len = strlen(ct_name) + strlen(ct_addr) + 2;
7437 	name_addr = kmem_alloc(len, KM_SLEEP);
7438 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7439 
7440 	if (ret_len)
7441 		*ret_len = len;
7442 	return (name_addr);
7443 }
7444 
7445 /*
7446  * Copy the contents of paddrnvl to vhci cache.
7447  * paddrnvl nvlist contains path information for a vhci client.
7448  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7449  */
7450 static void
7451 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
7452     mdi_vhcache_client_t *cct)
7453 {
7454 	nvpair_t *nvp = NULL;
7455 	mdi_vhcache_pathinfo_t *cpi;
7456 	uint_t nelem;
7457 	uint32_t *val;
7458 
7459 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7460 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
7461 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7462 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7463 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
7464 		ASSERT(nelem == 2);
7465 		cpi->cpi_cphci = cphci_list[val[0]];
7466 		cpi->cpi_flags = val[1];
7467 		enqueue_tail_vhcache_pathinfo(cct, cpi);
7468 	}
7469 }
7470 
7471 /*
7472  * Copy the contents of caddrmapnvl to vhci cache.
7473  * caddrmapnvl nvlist contains vhci client address to phci client address
7474  * mappings. See the comment in mainnvl_to_vhcache() for the format of
7475  * this nvlist.
7476  */
7477 static void
7478 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
7479     mdi_vhcache_phci_t *cphci_list[])
7480 {
7481 	nvpair_t *nvp = NULL;
7482 	nvlist_t *paddrnvl;
7483 	mdi_vhcache_client_t *cct;
7484 
7485 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7486 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
7487 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7488 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7489 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
7490 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
7491 		/* the client must contain at least one path */
7492 		ASSERT(cct->cct_cpi_head != NULL);
7493 
7494 		enqueue_vhcache_client(vhcache, cct);
7495 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7496 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7497 	}
7498 }
7499 
7500 /*
7501  * Copy the contents of the main nvlist to vhci cache.
7502  *
7503  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7504  * The nvlist contains the mappings between the vhci client addresses and
7505  * their corresponding phci client addresses.
7506  *
7507  * The structure of the nvlist is as follows:
7508  *
7509  * Main nvlist:
7510  *	NAME		TYPE		DATA
7511  *	version		int32		version number
7512  *	phcis		string array	array of phci paths
7513  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7514  *
7515  * structure of c2paddrs_nvl:
7516  *	NAME		TYPE		DATA
7517  *	caddr1		nvlist_t	paddrs_nvl1
7518  *	caddr2		nvlist_t	paddrs_nvl2
7519  *	...
7520  * where caddr1, caddr2, ... are vhci client name and addresses in the
7521  * form of "<clientname>@<clientaddress>".
7522  * (for example: "ssd@2000002037cd9f72");
7523  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7524  *
7525  * structure of paddrs_nvl:
7526  *	NAME		TYPE		DATA
7527  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7528  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7529  *	...
7530  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7531  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7532  * phci-ids are integers that identify PHCIs to which the
7533  * the bus specific address belongs to. These integers are used as an index
7534  * into to the phcis string array in the main nvlist to get the PHCI path.
7535  */
7536 static int
7537 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7538 {
7539 	char **phcis, **phci_namep;
7540 	uint_t nphcis;
7541 	mdi_vhcache_phci_t *cphci, **cphci_list;
7542 	nvlist_t *caddrmapnvl;
7543 	int32_t ver;
7544 	int i;
7545 	size_t cphci_list_size;
7546 
7547 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7548 
7549 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7550 	    ver != MDI_VHCI_CACHE_VERSION)
7551 		return (MDI_FAILURE);
7552 
7553 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7554 	    &nphcis) != 0)
7555 		return (MDI_SUCCESS);
7556 
7557 	ASSERT(nphcis > 0);
7558 
7559 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7560 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7561 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7562 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7563 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7564 		enqueue_vhcache_phci(vhcache, cphci);
7565 		cphci_list[i] = cphci;
7566 	}
7567 
7568 	ASSERT(vhcache->vhcache_phci_head != NULL);
7569 
7570 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7571 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7572 
7573 	kmem_free(cphci_list, cphci_list_size);
7574 	return (MDI_SUCCESS);
7575 }
7576 
7577 /*
7578  * Build paddrnvl for the specified client using the information in the
7579  * vhci cache and add it to the caddrmapnnvl.
7580  * Returns 0 on success, errno on failure.
7581  */
7582 static int
7583 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7584     nvlist_t *caddrmapnvl)
7585 {
7586 	mdi_vhcache_pathinfo_t *cpi;
7587 	nvlist_t *nvl;
7588 	int err;
7589 	uint32_t val[2];
7590 
7591 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7592 
7593 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7594 		return (err);
7595 
7596 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7597 		val[0] = cpi->cpi_cphci->cphci_id;
7598 		val[1] = cpi->cpi_flags;
7599 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7600 		    != 0)
7601 			goto out;
7602 	}
7603 
7604 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7605 out:
7606 	nvlist_free(nvl);
7607 	return (err);
7608 }
7609 
7610 /*
7611  * Build caddrmapnvl using the information in the vhci cache
7612  * and add it to the mainnvl.
7613  * Returns 0 on success, errno on failure.
7614  */
7615 static int
7616 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7617 {
7618 	mdi_vhcache_client_t *cct;
7619 	nvlist_t *nvl;
7620 	int err;
7621 
7622 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7623 
7624 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7625 		return (err);
7626 
7627 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7628 	    cct = cct->cct_next) {
7629 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7630 			goto out;
7631 	}
7632 
7633 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7634 out:
7635 	nvlist_free(nvl);
7636 	return (err);
7637 }
7638 
/*
 * Build nvlist using the information in the vhci cache.
 * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
 * Returns nvl on success, NULL on failure.
 *
 * When the cache holds no phcis the returned nvlist contains only the
 * version.  The caller owns the returned nvlist.
 */
static nvlist_t *
vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
{
	mdi_vhcache_phci_t *cphci;
	uint_t phci_count;
	char **phcis;
	nvlist_t *nvl;
	int err, i;

	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
		nvl = NULL;
		goto out;
	}

	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
	    MDI_VHCI_CACHE_VERSION)) != 0)
		goto out;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (vhcache->vhcache_phci_head == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return (nvl);
	}

	/*
	 * Number the phcis in list order; these ids are what the path
	 * entries store as their index into the phcis string array.
	 * NOTE(review): cphci_id is written while the lock is held only as
	 * reader -- confirm that concurrent callers cannot conflict.
	 */
	phci_count = 0;
	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci->cphci_next)
		cphci->cphci_id = phci_count++;

	/* build phci pathname list */
	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
	    cphci = cphci->cphci_next, i++)
		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);

	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
	    phci_count);
	/* the temporary copies are freed whether or not the add worked */
	free_string_array(phcis, phci_count);

	if (err == 0 &&
	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
		rw_exit(&vhcache->vhcache_lock);
		return (nvl);
	}

	rw_exit(&vhcache->vhcache_lock);
out:
	/* failure: discard the partially built nvlist, if any */
	if (nvl)
		nvlist_free(nvl);
	return (NULL);
}
7695 
7696 /*
7697  * Lookup vhcache phci structure for the specified phci path.
7698  */
7699 static mdi_vhcache_phci_t *
7700 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7701 {
7702 	mdi_vhcache_phci_t *cphci;
7703 
7704 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7705 
7706 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7707 	    cphci = cphci->cphci_next) {
7708 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7709 			return (cphci);
7710 	}
7711 
7712 	return (NULL);
7713 }
7714 
7715 /*
7716  * Lookup vhcache phci structure for the specified phci.
7717  */
7718 static mdi_vhcache_phci_t *
7719 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7720 {
7721 	mdi_vhcache_phci_t *cphci;
7722 
7723 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7724 
7725 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7726 	    cphci = cphci->cphci_next) {
7727 		if (cphci->cphci_phci == ph)
7728 			return (cphci);
7729 	}
7730 
7731 	return (NULL);
7732 }
7733 
7734 /*
7735  * Add the specified phci to the vhci cache if not already present.
7736  */
7737 static void
7738 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7739 {
7740 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7741 	mdi_vhcache_phci_t *cphci;
7742 	char *pathname;
7743 	int cache_updated;
7744 
7745 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7746 
7747 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7748 	(void) ddi_pathname(ph->ph_dip, pathname);
7749 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7750 	    != NULL) {
7751 		cphci->cphci_phci = ph;
7752 		cache_updated = 0;
7753 	} else {
7754 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7755 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7756 		cphci->cphci_phci = ph;
7757 		enqueue_vhcache_phci(vhcache, cphci);
7758 		cache_updated = 1;
7759 	}
7760 
7761 	rw_exit(&vhcache->vhcache_lock);
7762 
7763 	/*
7764 	 * Since a new phci has been added, reset
7765 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7766 	 * during next vhcache_discover_paths().
7767 	 */
7768 	mutex_enter(&vhc->vhc_lock);
7769 	vhc->vhc_path_discovery_cutoff_time = 0;
7770 	mutex_exit(&vhc->vhc_lock);
7771 
7772 	kmem_free(pathname, MAXPATHLEN);
7773 	if (cache_updated)
7774 		vhcache_dirty(vhc);
7775 }
7776 
7777 /*
7778  * Remove the reference to the specified phci from the vhci cache.
7779  */
7780 static void
7781 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7782 {
7783 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7784 	mdi_vhcache_phci_t *cphci;
7785 
7786 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7787 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
7788 		/* do not remove the actual mdi_vhcache_phci structure */
7789 		cphci->cphci_phci = NULL;
7790 	}
7791 	rw_exit(&vhcache->vhcache_lock);
7792 }
7793 
7794 static void
7795 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
7796     mdi_vhcache_lookup_token_t *src)
7797 {
7798 	if (src == NULL) {
7799 		dst->lt_cct = NULL;
7800 		dst->lt_cct_lookup_time = 0;
7801 	} else {
7802 		dst->lt_cct = src->lt_cct;
7803 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
7804 	}
7805 }
7806 
7807 /*
7808  * Look up vhcache client for the specified client.
7809  */
7810 static mdi_vhcache_client_t *
7811 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
7812     mdi_vhcache_lookup_token_t *token)
7813 {
7814 	mod_hash_val_t hv;
7815 	char *name_addr;
7816 	int len;
7817 
7818 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7819 
7820 	/*
7821 	 * If no vhcache clean occurred since the last lookup, we can
7822 	 * simply return the cct from the last lookup operation.
7823 	 * It works because ccts are never freed except during the vhcache
7824 	 * cleanup operation.
7825 	 */
7826 	if (token != NULL &&
7827 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
7828 		return (token->lt_cct);
7829 
7830 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
7831 	if (mod_hash_find(vhcache->vhcache_client_hash,
7832 	    (mod_hash_key_t)name_addr, &hv) == 0) {
7833 		if (token) {
7834 			token->lt_cct = (mdi_vhcache_client_t *)hv;
7835 			token->lt_cct_lookup_time = lbolt64;
7836 		}
7837 	} else {
7838 		if (token) {
7839 			token->lt_cct = NULL;
7840 			token->lt_cct_lookup_time = 0;
7841 		}
7842 		hv = NULL;
7843 	}
7844 	kmem_free(name_addr, len);
7845 	return ((mdi_vhcache_client_t *)hv);
7846 }
7847 
7848 /*
7849  * Add the specified path to the vhci cache if not already present.
7850  * Also add the vhcache client for the client corresponding to this path
7851  * if it doesn't already exist.
7852  */
7853 static void
7854 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7855 {
7856 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7857 	mdi_vhcache_client_t *cct;
7858 	mdi_vhcache_pathinfo_t *cpi;
7859 	mdi_phci_t *ph = pip->pi_phci;
7860 	mdi_client_t *ct = pip->pi_client;
7861 	int cache_updated = 0;
7862 
7863 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7864 
7865 	/* if vhcache client for this pip doesn't already exist, add it */
7866 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7867 	    NULL)) == NULL) {
7868 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7869 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
7870 		    ct->ct_guid, NULL);
7871 		enqueue_vhcache_client(vhcache, cct);
7872 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7873 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7874 		cache_updated = 1;
7875 	}
7876 
7877 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7878 		if (cpi->cpi_cphci->cphci_phci == ph &&
7879 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
7880 			cpi->cpi_pip = pip;
7881 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
7882 				cpi->cpi_flags &=
7883 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7884 				sort_vhcache_paths(cct);
7885 				cache_updated = 1;
7886 			}
7887 			break;
7888 		}
7889 	}
7890 
7891 	if (cpi == NULL) {
7892 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7893 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
7894 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
7895 		ASSERT(cpi->cpi_cphci != NULL);
7896 		cpi->cpi_pip = pip;
7897 		enqueue_vhcache_pathinfo(cct, cpi);
7898 		cache_updated = 1;
7899 	}
7900 
7901 	rw_exit(&vhcache->vhcache_lock);
7902 
7903 	if (cache_updated)
7904 		vhcache_dirty(vhc);
7905 }
7906 
7907 /*
7908  * Remove the reference to the specified path from the vhci cache.
7909  */
7910 static void
7911 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7912 {
7913 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7914 	mdi_client_t *ct = pip->pi_client;
7915 	mdi_vhcache_client_t *cct;
7916 	mdi_vhcache_pathinfo_t *cpi;
7917 
7918 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7919 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7920 	    NULL)) != NULL) {
7921 		for (cpi = cct->cct_cpi_head; cpi != NULL;
7922 		    cpi = cpi->cpi_next) {
7923 			if (cpi->cpi_pip == pip) {
7924 				cpi->cpi_pip = NULL;
7925 				break;
7926 			}
7927 		}
7928 	}
7929 	rw_exit(&vhcache->vhcache_lock);
7930 }
7931 
7932 /*
7933  * Flush the vhci cache to disk.
7934  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
7935  */
7936 static int
7937 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
7938 {
7939 	nvlist_t *nvl;
7940 	int err;
7941 	int rv;
7942 
7943 	/*
7944 	 * It is possible that the system may shutdown before
7945 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
7946 	 * flushing the cache in this case do not check for
7947 	 * i_ddi_io_initialized when force flag is set.
7948 	 */
7949 	if (force_flag == 0 && !i_ddi_io_initialized())
7950 		return (MDI_FAILURE);
7951 
7952 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
7953 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
7954 		nvlist_free(nvl);
7955 	} else
7956 		err = EFAULT;
7957 
7958 	rv = MDI_SUCCESS;
7959 	mutex_enter(&vhc->vhc_lock);
7960 	if (err != 0) {
7961 		if (err == EROFS) {
7962 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
7963 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
7964 			    MDI_VHC_VHCACHE_DIRTY);
7965 		} else {
7966 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
7967 				cmn_err(CE_CONT, "%s: update failed\n",
7968 				    vhc->vhc_vhcache_filename);
7969 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
7970 			}
7971 			rv = MDI_FAILURE;
7972 		}
7973 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
7974 		cmn_err(CE_CONT,
7975 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
7976 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
7977 	}
7978 	mutex_exit(&vhc->vhc_lock);
7979 
7980 	return (rv);
7981 }
7982 
7983 /*
7984  * Call flush_vhcache() to flush the vhci cache at the scheduled time.
7985  * Exits itself if left idle for the idle timeout period.
7986  */
7987 static void
7988 vhcache_flush_thread(void *arg)
7989 {
7990 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7991 	clock_t idle_time, quit_at_ticks;
7992 	callb_cpr_t cprinfo;
7993 
7994 	/* number of seconds to sleep idle before exiting */
7995 	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;
7996 
7997 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
7998 	    "mdi_vhcache_flush");
7999 	mutex_enter(&vhc->vhc_lock);
8000 	for (; ; ) {
8001 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8002 		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
8003 			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
8004 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
8005 				(void) cv_timedwait(&vhc->vhc_cv,
8006 				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
8007 				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8008 			} else {
8009 				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
8010 				mutex_exit(&vhc->vhc_lock);
8011 
8012 				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
8013 					vhcache_dirty(vhc);
8014 
8015 				mutex_enter(&vhc->vhc_lock);
8016 			}
8017 		}
8018 
8019 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8020 
8021 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8022 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
8023 		    ddi_get_lbolt() < quit_at_ticks) {
8024 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8025 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8026 			    quit_at_ticks);
8027 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8028 		}
8029 
8030 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8031 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
8032 			goto out;
8033 	}
8034 
8035 out:
8036 	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
8037 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8038 	CALLB_CPR_EXIT(&cprinfo);
8039 }
8040 
8041 /*
8042  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
8043  */
8044 static void
8045 vhcache_dirty(mdi_vhci_config_t *vhc)
8046 {
8047 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8048 	int create_thread;
8049 
8050 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8051 	/* do not flush cache until the cache is fully built */
8052 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8053 		rw_exit(&vhcache->vhcache_lock);
8054 		return;
8055 	}
8056 	rw_exit(&vhcache->vhcache_lock);
8057 
8058 	mutex_enter(&vhc->vhc_lock);
8059 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
8060 		mutex_exit(&vhc->vhc_lock);
8061 		return;
8062 	}
8063 
8064 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
8065 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
8066 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
8067 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
8068 		cv_broadcast(&vhc->vhc_cv);
8069 		create_thread = 0;
8070 	} else {
8071 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
8072 		create_thread = 1;
8073 	}
8074 	mutex_exit(&vhc->vhc_lock);
8075 
8076 	if (create_thread)
8077 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
8078 		    0, &p0, TS_RUN, minclsyspri);
8079 }
8080 
/*
 * phci bus config structure - one for each phci bus config operation that
 * we initiate on behalf of a vhci. It is allocated by
 * bus_config_all_phcis() and freed by bus_config_phci().
 */
typedef struct mdi_phci_bus_config_s {
	/* private copy of the phci device path */
	char *phbc_phci_path;
	/* the vhci bus config operation this request belongs to */
	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;
	/* next request in the list built by bus_config_all_phcis() */
	struct mdi_phci_bus_config_s *phbc_next;
} mdi_phci_bus_config_t;
8090 
8091 /* vhci bus config structure - one for each vhci bus config operation */
8092 typedef struct mdi_vhci_bus_config_s {
8093 	ddi_bus_config_op_t vhbc_op;	/* bus config op */
8094 	major_t vhbc_op_major;		/* bus config op major */
8095 	uint_t vhbc_op_flags;		/* bus config op flags */
8096 	kmutex_t vhbc_lock;
8097 	kcondvar_t vhbc_cv;
8098 	int vhbc_thr_count;
8099 } mdi_vhci_bus_config_t;
8100 
8101 /*
8102  * bus config the specified phci
8103  */
8104 static void
8105 bus_config_phci(void *arg)
8106 {
8107 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
8108 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
8109 	dev_info_t *ph_dip;
8110 
8111 	/*
8112 	 * first configure all path components upto phci and then configure
8113 	 * the phci children.
8114 	 */
8115 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
8116 	    != NULL) {
8117 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
8118 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
8119 			(void) ndi_devi_config_driver(ph_dip,
8120 			    vhbc->vhbc_op_flags,
8121 			    vhbc->vhbc_op_major);
8122 		} else
8123 			(void) ndi_devi_config(ph_dip,
8124 			    vhbc->vhbc_op_flags);
8125 
8126 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8127 		ndi_rele_devi(ph_dip);
8128 	}
8129 
8130 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
8131 	kmem_free(phbc, sizeof (*phbc));
8132 
8133 	mutex_enter(&vhbc->vhbc_lock);
8134 	vhbc->vhbc_thr_count--;
8135 	if (vhbc->vhbc_thr_count == 0)
8136 		cv_broadcast(&vhbc->vhbc_cv);
8137 	mutex_exit(&vhbc->vhbc_lock);
8138 }
8139 
8140 /*
8141  * Bus config all phcis associated with the vhci in parallel.
8142  * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
8143  */
8144 static void
8145 bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
8146     ddi_bus_config_op_t op, major_t maj)
8147 {
8148 	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
8149 	mdi_vhci_bus_config_t *vhbc;
8150 	mdi_vhcache_phci_t *cphci;
8151 
8152 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8153 	if (vhcache->vhcache_phci_head == NULL) {
8154 		rw_exit(&vhcache->vhcache_lock);
8155 		return;
8156 	}
8157 
8158 	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);
8159 
8160 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8161 	    cphci = cphci->cphci_next) {
8162 		/* skip phcis that haven't attached before root is available */
8163 		if (!modrootloaded && (cphci->cphci_phci == NULL))
8164 			continue;
8165 		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
8166 		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
8167 		    KM_SLEEP);
8168 		phbc->phbc_vhbusconfig = vhbc;
8169 		phbc->phbc_next = phbc_head;
8170 		phbc_head = phbc;
8171 		vhbc->vhbc_thr_count++;
8172 	}
8173 	rw_exit(&vhcache->vhcache_lock);
8174 
8175 	vhbc->vhbc_op = op;
8176 	vhbc->vhbc_op_major = maj;
8177 	vhbc->vhbc_op_flags = NDI_NO_EVENT |
8178 	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
8179 	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
8180 	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);
8181 
8182 	/* now create threads to initiate bus config on all phcis in parallel */
8183 	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
8184 		phbc_next = phbc->phbc_next;
8185 		if (mdi_mtc_off)
8186 			bus_config_phci((void *)phbc);
8187 		else
8188 			(void) thread_create(NULL, 0, bus_config_phci, phbc,
8189 			    0, &p0, TS_RUN, minclsyspri);
8190 	}
8191 
8192 	mutex_enter(&vhbc->vhbc_lock);
8193 	/* wait until all threads exit */
8194 	while (vhbc->vhbc_thr_count > 0)
8195 		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
8196 	mutex_exit(&vhbc->vhbc_lock);
8197 
8198 	mutex_destroy(&vhbc->vhbc_lock);
8199 	cv_destroy(&vhbc->vhbc_cv);
8200 	kmem_free(vhbc, sizeof (*vhbc));
8201 }
8202 
8203 /*
8204  * Single threaded version of bus_config_all_phcis()
8205  */
8206 static void
8207 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
8208     ddi_bus_config_op_t op, major_t maj)
8209 {
8210 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8211 
8212 	single_threaded_vhconfig_enter(vhc);
8213 	bus_config_all_phcis(vhcache, flags, op, maj);
8214 	single_threaded_vhconfig_exit(vhc);
8215 }
8216 
8217 /*
8218  * Perform BUS_CONFIG_ONE on the specified child of the phci.
8219  * The path includes the child component in addition to the phci path.
8220  */
8221 static int
8222 bus_config_one_phci_child(char *path)
8223 {
8224 	dev_info_t *ph_dip, *child;
8225 	char *devnm;
8226 	int rv = MDI_FAILURE;
8227 
8228 	/* extract the child component of the phci */
8229 	devnm = strrchr(path, '/');
8230 	*devnm++ = '\0';
8231 
8232 	/*
8233 	 * first configure all path components upto phci and then
8234 	 * configure the phci child.
8235 	 */
8236 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
8237 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
8238 		    NDI_SUCCESS) {
8239 			/*
8240 			 * release the hold that ndi_devi_config_one() placed
8241 			 */
8242 			ndi_rele_devi(child);
8243 			rv = MDI_SUCCESS;
8244 		}
8245 
8246 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8247 		ndi_rele_devi(ph_dip);
8248 	}
8249 
8250 	devnm--;
8251 	*devnm = '/';
8252 	return (rv);
8253 }
8254 
8255 /*
8256  * Build a list of phci client paths for the specified vhci client.
8257  * The list includes only those phci client paths which aren't configured yet.
8258  */
8259 static mdi_phys_path_t *
8260 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
8261 {
8262 	mdi_vhcache_pathinfo_t *cpi;
8263 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
8264 	int config_path, len;
8265 
8266 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8267 		/*
8268 		 * include only those paths that aren't configured.
8269 		 */
8270 		config_path = 0;
8271 		if (cpi->cpi_pip == NULL)
8272 			config_path = 1;
8273 		else {
8274 			MDI_PI_LOCK(cpi->cpi_pip);
8275 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
8276 				config_path = 1;
8277 			MDI_PI_UNLOCK(cpi->cpi_pip);
8278 		}
8279 
8280 		if (config_path) {
8281 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
8282 			len = strlen(cpi->cpi_cphci->cphci_path) +
8283 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
8284 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
8285 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
8286 			    cpi->cpi_cphci->cphci_path, ct_name,
8287 			    cpi->cpi_addr);
8288 			pp->phys_path_next = NULL;
8289 
8290 			if (pp_head == NULL)
8291 				pp_head = pp;
8292 			else
8293 				pp_tail->phys_path_next = pp;
8294 			pp_tail = pp;
8295 		}
8296 	}
8297 
8298 	return (pp_head);
8299 }
8300 
8301 /*
8302  * Free the memory allocated for phci client path list.
8303  */
8304 static void
8305 free_phclient_path_list(mdi_phys_path_t *pp_head)
8306 {
8307 	mdi_phys_path_t *pp, *pp_next;
8308 
8309 	for (pp = pp_head; pp != NULL; pp = pp_next) {
8310 		pp_next = pp->phys_path_next;
8311 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
8312 		kmem_free(pp, sizeof (*pp));
8313 	}
8314 }
8315 
8316 /*
8317  * Allocated async client structure and initialize with the specified values.
8318  */
8319 static mdi_async_client_config_t *
8320 alloc_async_client_config(char *ct_name, char *ct_addr,
8321     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8322 {
8323 	mdi_async_client_config_t *acc;
8324 
8325 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
8326 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
8327 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
8328 	acc->acc_phclient_path_list_head = pp_head;
8329 	init_vhcache_lookup_token(&acc->acc_token, tok);
8330 	acc->acc_next = NULL;
8331 	return (acc);
8332 }
8333 
8334 /*
8335  * Free the memory allocated for the async client structure and their members.
8336  */
8337 static void
8338 free_async_client_config(mdi_async_client_config_t *acc)
8339 {
8340 	if (acc->acc_phclient_path_list_head)
8341 		free_phclient_path_list(acc->acc_phclient_path_list_head);
8342 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
8343 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
8344 	kmem_free(acc, sizeof (*acc));
8345 }
8346 
8347 /*
8348  * Sort vhcache pathinfos (cpis) of the specified client.
8349  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
8350  * flag set come at the beginning of the list. All cpis which have this
8351  * flag set come at the end of the list.
8352  */
8353 static void
8354 sort_vhcache_paths(mdi_vhcache_client_t *cct)
8355 {
8356 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
8357 
8358 	cpi_head = cct->cct_cpi_head;
8359 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8360 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8361 		cpi_next = cpi->cpi_next;
8362 		enqueue_vhcache_pathinfo(cct, cpi);
8363 	}
8364 }
8365 
8366 /*
8367  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
8368  * every vhcache pathinfo of the specified client. If not adjust the flag
8369  * setting appropriately.
8370  *
8371  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
8372  * on-disk vhci cache. So every time this flag is updated the cache must be
8373  * flushed.
8374  */
8375 static void
8376 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8377     mdi_vhcache_lookup_token_t *tok)
8378 {
8379 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8380 	mdi_vhcache_client_t *cct;
8381 	mdi_vhcache_pathinfo_t *cpi;
8382 
8383 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8384 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
8385 	    == NULL) {
8386 		rw_exit(&vhcache->vhcache_lock);
8387 		return;
8388 	}
8389 
8390 	/*
8391 	 * to avoid unnecessary on-disk cache updates, first check if an
8392 	 * update is really needed. If no update is needed simply return.
8393 	 */
8394 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8395 		if ((cpi->cpi_pip != NULL &&
8396 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
8397 		    (cpi->cpi_pip == NULL &&
8398 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
8399 			break;
8400 		}
8401 	}
8402 	if (cpi == NULL) {
8403 		rw_exit(&vhcache->vhcache_lock);
8404 		return;
8405 	}
8406 
8407 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
8408 		rw_exit(&vhcache->vhcache_lock);
8409 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8410 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
8411 		    tok)) == NULL) {
8412 			rw_exit(&vhcache->vhcache_lock);
8413 			return;
8414 		}
8415 	}
8416 
8417 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8418 		if (cpi->cpi_pip != NULL)
8419 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8420 		else
8421 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8422 	}
8423 	sort_vhcache_paths(cct);
8424 
8425 	rw_exit(&vhcache->vhcache_lock);
8426 	vhcache_dirty(vhc);
8427 }
8428 
8429 /*
8430  * Configure all specified paths of the client.
8431  */
8432 static void
8433 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8434     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8435 {
8436 	mdi_phys_path_t *pp;
8437 
8438 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8439 		(void) bus_config_one_phci_child(pp->phys_path);
8440 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8441 }
8442 
8443 /*
8444  * Dequeue elements from vhci async client config list and bus configure
8445  * their corresponding phci clients.
8446  */
8447 static void
8448 config_client_paths_thread(void *arg)
8449 {
8450 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8451 	mdi_async_client_config_t *acc;
8452 	clock_t quit_at_ticks;
8453 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
8454 	callb_cpr_t cprinfo;
8455 
8456 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8457 	    "mdi_config_client_paths");
8458 
8459 	for (; ; ) {
8460 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8461 
8462 		mutex_enter(&vhc->vhc_lock);
8463 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8464 		    vhc->vhc_acc_list_head == NULL &&
8465 		    ddi_get_lbolt() < quit_at_ticks) {
8466 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8467 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8468 			    quit_at_ticks);
8469 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8470 		}
8471 
8472 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8473 		    vhc->vhc_acc_list_head == NULL)
8474 			goto out;
8475 
8476 		acc = vhc->vhc_acc_list_head;
8477 		vhc->vhc_acc_list_head = acc->acc_next;
8478 		if (vhc->vhc_acc_list_head == NULL)
8479 			vhc->vhc_acc_list_tail = NULL;
8480 		vhc->vhc_acc_count--;
8481 		mutex_exit(&vhc->vhc_lock);
8482 
8483 		config_client_paths_sync(vhc, acc->acc_ct_name,
8484 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
8485 		    &acc->acc_token);
8486 
8487 		free_async_client_config(acc);
8488 	}
8489 
8490 out:
8491 	vhc->vhc_acc_thrcount--;
8492 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8493 	CALLB_CPR_EXIT(&cprinfo);
8494 }
8495 
8496 /*
8497  * Arrange for all the phci client paths (pp_head) for the specified client
8498  * to be bus configured asynchronously by a thread.
8499  */
8500 static void
8501 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8502     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8503 {
8504 	mdi_async_client_config_t *acc, *newacc;
8505 	int create_thread;
8506 
8507 	if (pp_head == NULL)
8508 		return;
8509 
8510 	if (mdi_mtc_off) {
8511 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
8512 		free_phclient_path_list(pp_head);
8513 		return;
8514 	}
8515 
8516 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
8517 	ASSERT(newacc);
8518 
8519 	mutex_enter(&vhc->vhc_lock);
8520 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8521 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8522 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8523 			free_async_client_config(newacc);
8524 			mutex_exit(&vhc->vhc_lock);
8525 			return;
8526 		}
8527 	}
8528 
8529 	if (vhc->vhc_acc_list_head == NULL)
8530 		vhc->vhc_acc_list_head = newacc;
8531 	else
8532 		vhc->vhc_acc_list_tail->acc_next = newacc;
8533 	vhc->vhc_acc_list_tail = newacc;
8534 	vhc->vhc_acc_count++;
8535 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8536 		cv_broadcast(&vhc->vhc_cv);
8537 		create_thread = 0;
8538 	} else {
8539 		vhc->vhc_acc_thrcount++;
8540 		create_thread = 1;
8541 	}
8542 	mutex_exit(&vhc->vhc_lock);
8543 
8544 	if (create_thread)
8545 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8546 		    0, &p0, TS_RUN, minclsyspri);
8547 }
8548 
8549 /*
8550  * Return number of online paths for the specified client.
8551  */
8552 static int
8553 nonline_paths(mdi_vhcache_client_t *cct)
8554 {
8555 	mdi_vhcache_pathinfo_t *cpi;
8556 	int online_count = 0;
8557 
8558 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8559 		if (cpi->cpi_pip != NULL) {
8560 			MDI_PI_LOCK(cpi->cpi_pip);
8561 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8562 				online_count++;
8563 			MDI_PI_UNLOCK(cpi->cpi_pip);
8564 		}
8565 	}
8566 
8567 	return (online_count);
8568 }
8569 
8570 /*
8571  * Bus configure all paths for the specified vhci client.
8572  * If at least one path for the client is already online, the remaining paths
8573  * will be configured asynchronously. Otherwise, it synchronously configures
8574  * the paths until at least one path is online and then rest of the paths
8575  * will be configured asynchronously.
8576  */
8577 static void
8578 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
8579 {
8580 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8581 	mdi_phys_path_t *pp_head, *pp;
8582 	mdi_vhcache_client_t *cct;
8583 	mdi_vhcache_lookup_token_t tok;
8584 
8585 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8586 
8587 	init_vhcache_lookup_token(&tok, NULL);
8588 
8589 	if (ct_name == NULL || ct_addr == NULL ||
8590 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
8591 	    == NULL ||
8592 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
8593 		rw_exit(&vhcache->vhcache_lock);
8594 		return;
8595 	}
8596 
8597 	/* if at least one path is online, configure the rest asynchronously */
8598 	if (nonline_paths(cct) > 0) {
8599 		rw_exit(&vhcache->vhcache_lock);
8600 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
8601 		return;
8602 	}
8603 
8604 	rw_exit(&vhcache->vhcache_lock);
8605 
8606 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
8607 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
8608 			rw_enter(&vhcache->vhcache_lock, RW_READER);
8609 
8610 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
8611 			    ct_addr, &tok)) == NULL) {
8612 				rw_exit(&vhcache->vhcache_lock);
8613 				goto out;
8614 			}
8615 
8616 			if (nonline_paths(cct) > 0 &&
8617 			    pp->phys_path_next != NULL) {
8618 				rw_exit(&vhcache->vhcache_lock);
8619 				config_client_paths_async(vhc, ct_name, ct_addr,
8620 				    pp->phys_path_next, &tok);
8621 				pp->phys_path_next = NULL;
8622 				goto out;
8623 			}
8624 
8625 			rw_exit(&vhcache->vhcache_lock);
8626 		}
8627 	}
8628 
8629 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
8630 out:
8631 	free_phclient_path_list(pp_head);
8632 }
8633 
8634 static void
8635 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8636 {
8637 	mutex_enter(&vhc->vhc_lock);
8638 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8639 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8640 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8641 	mutex_exit(&vhc->vhc_lock);
8642 }
8643 
8644 static void
8645 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8646 {
8647 	mutex_enter(&vhc->vhc_lock);
8648 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8649 	cv_broadcast(&vhc->vhc_cv);
8650 	mutex_exit(&vhc->vhc_lock);
8651 }
8652 
/* One entry of a built-in phci driver table. */
typedef struct mdi_phci_driver_info {
	/* name of the phci driver */
	char	*phdriver_name;

	/* non-zero if the phci driver supports the root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;
8659 
8660 /*
8661  * vhci class and root support capability of a phci driver can be
8662  * specified using ddi-vhci-class and ddi-no-root-support properties in the
8663  * phci driver.conf file. The built-in tables below contain this information
8664  * for those phci drivers whose driver.conf files don't yet contain this info.
8665  *
8666  * All phci drivers expect iscsi have root device support.
8667  */
8668 static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
8669 	{ "fp", 1 },
8670 	{ "iscsi", 0 },
8671 	{ "ibsrp", 1 }
8672 	};
8673 
8674 static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };
8675 
8676 static void *
8677 mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
8678 {
8679 	void *new_ptr;
8680 
8681 	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
8682 	if (old_ptr) {
8683 		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
8684 		kmem_free(old_ptr, old_size);
8685 	}
8686 	return (new_ptr);
8687 }
8688 
8689 static void
8690 add_to_phci_list(char ***driver_list, int **root_support_list,
8691     int *cur_elements, int *max_elements, char *driver_name, int root_support)
8692 {
8693 	ASSERT(*cur_elements <= *max_elements);
8694 	if (*cur_elements == *max_elements) {
8695 		*max_elements += 10;
8696 		*driver_list = mdi_realloc(*driver_list,
8697 		    sizeof (char *) * (*cur_elements),
8698 		    sizeof (char *) * (*max_elements));
8699 		*root_support_list = mdi_realloc(*root_support_list,
8700 		    sizeof (int) * (*cur_elements),
8701 		    sizeof (int) * (*max_elements));
8702 	}
8703 	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
8704 	(*root_support_list)[*cur_elements] = root_support;
8705 	(*cur_elements)++;
8706 }
8707 
8708 static void
8709 get_phci_driver_list(char *vhci_class, char ***driver_list,
8710     int **root_support_list, int *cur_elements, int *max_elements)
8711 {
8712 	mdi_phci_driver_info_t	*st_driver_list, *p;
8713 	int		st_ndrivers, root_support, i, j, driver_conf_count;
8714 	major_t		m;
8715 	struct devnames	*dnp;
8716 	ddi_prop_t	*propp;
8717 
8718 	*driver_list = NULL;
8719 	*root_support_list = NULL;
8720 	*cur_elements = 0;
8721 	*max_elements = 0;
8722 
8723 	/* add the phci drivers derived from the phci driver.conf files */
8724 	for (m = 0; m < devcnt; m++) {
8725 		dnp = &devnamesp[m];
8726 
8727 		if (dnp->dn_flags & DN_PHCI_DRIVER) {
8728 			LOCK_DEV_OPS(&dnp->dn_lock);
8729 			if (dnp->dn_global_prop_ptr != NULL &&
8730 			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
8731 			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
8732 			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
8733 			    strcmp(propp->prop_val, vhci_class) == 0) {
8734 
8735 				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
8736 				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
8737 				    &dnp->dn_global_prop_ptr->prop_list)
8738 				    == NULL) ? 1 : 0;
8739 
8740 				add_to_phci_list(driver_list, root_support_list,
8741 				    cur_elements, max_elements, dnp->dn_name,
8742 				    root_support);
8743 
8744 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8745 			} else
8746 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8747 		}
8748 	}
8749 
8750 	driver_conf_count = *cur_elements;
8751 
8752 	/* add the phci drivers specified in the built-in tables */
8753 	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
8754 		st_driver_list = scsi_phci_driver_list;
8755 		st_ndrivers = sizeof (scsi_phci_driver_list) /
8756 		    sizeof (mdi_phci_driver_info_t);
8757 	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
8758 		st_driver_list = ib_phci_driver_list;
8759 		st_ndrivers = sizeof (ib_phci_driver_list) /
8760 		    sizeof (mdi_phci_driver_info_t);
8761 	} else {
8762 		st_driver_list = NULL;
8763 		st_ndrivers = 0;
8764 	}
8765 
8766 	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
8767 		/* add this phci driver if not already added before */
8768 		for (j = 0; j < driver_conf_count; j++) {
8769 			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
8770 				break;
8771 		}
8772 		if (j == driver_conf_count) {
8773 			add_to_phci_list(driver_list, root_support_list,
8774 			    cur_elements, max_elements, p->phdriver_name,
8775 			    p->phdriver_root_support);
8776 		}
8777 	}
8778 }
8779 
8780 /*
8781  * Attach the phci driver instances associated with the specified vhci class.
8782  * If root is mounted attach all phci driver instances.
8783  * If root is not mounted, attach the instances of only those phci
8784  * drivers that have the root support.
8785  */
8786 static void
8787 attach_phci_drivers(char *vhci_class)
8788 {
8789 	char	**driver_list, **p;
8790 	int	*root_support_list;
8791 	int	cur_elements, max_elements, i;
8792 	major_t	m;
8793 
8794 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
8795 	    &cur_elements, &max_elements);
8796 
8797 	for (i = 0; i < cur_elements; i++) {
8798 		if (modrootloaded || root_support_list[i]) {
8799 			m = ddi_name_to_major(driver_list[i]);
8800 			if (m != DDI_MAJOR_T_NONE &&
8801 			    ddi_hold_installed_driver(m))
8802 				ddi_rele_driver(m);
8803 		}
8804 	}
8805 
8806 	if (driver_list) {
8807 		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
8808 			kmem_free(*p, strlen(*p) + 1);
8809 		kmem_free(driver_list, sizeof (char *) * max_elements);
8810 		kmem_free(root_support_list, sizeof (int) * max_elements);
8811 	}
8812 }
8813 
8814 /*
8815  * Build vhci cache:
8816  *
8817  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
8818  * the phci driver instances. During this process the cache gets built.
8819  *
8820  * Cache is built fully if the root is mounted.
8821  * If the root is not mounted, phci drivers that do not have root support
8822  * are not attached. As a result the cache is built partially. The entries
8823  * in the cache reflect only those phci drivers that have root support.
8824  */
8825 static int
8826 build_vhci_cache(mdi_vhci_t *vh)
8827 {
8828 	mdi_vhci_config_t *vhc = vh->vh_config;
8829 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8830 
8831 	single_threaded_vhconfig_enter(vhc);
8832 
8833 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8834 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
8835 		rw_exit(&vhcache->vhcache_lock);
8836 		single_threaded_vhconfig_exit(vhc);
8837 		return (0);
8838 	}
8839 	rw_exit(&vhcache->vhcache_lock);
8840 
8841 	attach_phci_drivers(vh->vh_class);
8842 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
8843 	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
8844 
8845 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8846 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
8847 	rw_exit(&vhcache->vhcache_lock);
8848 
8849 	single_threaded_vhconfig_exit(vhc);
8850 	vhcache_dirty(vhc);
8851 	return (1);
8852 }
8853 
8854 /*
8855  * Determine if discovery of paths is needed.
8856  */
8857 static int
8858 vhcache_do_discovery(mdi_vhci_config_t *vhc)
8859 {
8860 	int rv = 1;
8861 
8862 	mutex_enter(&vhc->vhc_lock);
8863 	if (i_ddi_io_initialized() == 0) {
8864 		if (vhc->vhc_path_discovery_boot > 0) {
8865 			vhc->vhc_path_discovery_boot--;
8866 			goto out;
8867 		}
8868 	} else {
8869 		if (vhc->vhc_path_discovery_postboot > 0) {
8870 			vhc->vhc_path_discovery_postboot--;
8871 			goto out;
8872 		}
8873 	}
8874 
8875 	/*
8876 	 * Do full path discovery at most once per mdi_path_discovery_interval.
8877 	 * This is to avoid a series of full path discoveries when opening
8878 	 * stale /dev/[r]dsk links.
8879 	 */
8880 	if (mdi_path_discovery_interval != -1 &&
8881 	    lbolt64 >= vhc->vhc_path_discovery_cutoff_time)
8882 		goto out;
8883 
8884 	rv = 0;
8885 out:
8886 	mutex_exit(&vhc->vhc_lock);
8887 	return (rv);
8888 }
8889 
8890 /*
8891  * Discover all paths:
8892  *
8893  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
8894  * driver instances. During this process all paths will be discovered.
8895  */
8896 static int
8897 vhcache_discover_paths(mdi_vhci_t *vh)
8898 {
8899 	mdi_vhci_config_t *vhc = vh->vh_config;
8900 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8901 	int rv = 0;
8902 
8903 	single_threaded_vhconfig_enter(vhc);
8904 
8905 	if (vhcache_do_discovery(vhc)) {
8906 		attach_phci_drivers(vh->vh_class);
8907 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
8908 		    NDI_NO_EVENT, BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
8909 
8910 		mutex_enter(&vhc->vhc_lock);
8911 		vhc->vhc_path_discovery_cutoff_time = lbolt64 +
8912 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
8913 		mutex_exit(&vhc->vhc_lock);
8914 		rv = 1;
8915 	}
8916 
8917 	single_threaded_vhconfig_exit(vhc);
8918 	return (rv);
8919 }
8920 
/*
 * Generic vhci bus config implementation:
 *
 * Parameters
 *	vdip	vhci dip
 *	flags	bus config flags
 *	op	bus config operation
 *	The remaining parameters are bus config operation specific
 *
 * for BUS_CONFIG_ONE
 *	arg	pointer to name@addr
 *	child	upon successful return from this function, *child will be
 *		set to the configured and held devinfo child node of vdip.
 *	ct_addr	pointer to client address (i.e. GUID)
 *
 * for BUS_CONFIG_DRIVER
 *	arg	major number of the driver
 *	child and ct_addr parameters are ignored
 *
 * for BUS_CONFIG_ALL
 *	arg, child, and ct_addr parameters are ignored
 *
 * Note that for the rest of the bus config operations, this function simply
 * calls the framework provided default bus config routine.
 *
 * Return Values:
 *	MDI_SUCCESS	ndi_busop_bus_config() succeeded, either on the first
 *			attempt or, for BUS_CONFIG_ONE with a valid name@addr,
 *			after a full path discovery and a second attempt
 *	MDI_FAILURE	otherwise
 */
int
mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
    void *arg, dev_info_t **child, char *ct_addr)
{
	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	int rv = 0;
	int params_valid = 0;
	char *cp;

	/*
	 * To bus config vhcis we relay operation, possibly using another
	 * thread, to phcis. The phci driver then interacts with MDI to cause
	 * vhci child nodes to be enumerated under the vhci node.  Adding a
	 * vhci child requires an ndi_devi_enter of the vhci. Since another
	 * thread may be adding the child, to avoid deadlock we can't wait
	 * for the relayed operations to complete if we have already entered
	 * the vhci node.
	 */
	if (DEVI_BUSY_OWNED(vdip)) {
		MDI_DEBUG(2, (CE_NOTE, vdip, "!MDI: vhci bus config: "
		    "vhci dip is busy owned %p\n", (void *)vdip));
		goto default_bus_config;
	}

	/*
	 * Build the cache on first use.  build_vhci_cache() takes
	 * vhcache_lock itself, so the reader hold is dropped around the call
	 * and re-taken afterwards.  rv ends up nonzero only when this call
	 * actually built the cache, which already drove BUS_CONFIG_ALL on the
	 * phcis.
	 */
	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
		rw_exit(&vhcache->vhcache_lock);
		rv = build_vhci_cache(vh);
		rw_enter(&vhcache->vhcache_lock, RW_READER);
	}

	/* every arm of this switch leaves with vhcache_lock released */
	switch (op) {
	case BUS_CONFIG_ONE:
		if (arg != NULL && ct_addr != NULL) {
			/* extract node name */
			cp = (char *)arg;
			while (*cp != '\0' && *cp != '@')
				cp++;
			if (*cp == '@') {
				params_valid = 1;
				/*
				 * Temporarily cut the string at '@' so that
				 * arg reads as the bare node name; the '@' is
				 * put back once the call returns.
				 */
				*cp = '\0';
				config_client_paths(vhc, (char *)arg, ct_addr);
				/* config_client_paths() releases cache_lock */
				*cp = '@';
				break;
			}
		}

		/* no usable name@addr: nothing to relay, just drop the lock */
		rw_exit(&vhcache->vhcache_lock);
		break;

	case BUS_CONFIG_DRIVER:
		rw_exit(&vhcache->vhcache_lock);
		/* skip if the cache build above just configured the phcis */
		if (rv == 0)
			st_bus_config_all_phcis(vhc, flags, op,
			    (major_t)(uintptr_t)arg);
		break;

	case BUS_CONFIG_ALL:
		rw_exit(&vhcache->vhcache_lock);
		/* skip if the cache build above just configured the phcis */
		if (rv == 0)
			st_bus_config_all_phcis(vhc, flags, op, -1);
		break;

	default:
		rw_exit(&vhcache->vhcache_lock);
		break;
	}


default_bus_config:
	/*
	 * All requested child nodes are enumerated under the vhci.
	 * Now configure them.
	 */
	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
	    NDI_SUCCESS) {
		return (MDI_SUCCESS);
	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
		/*
		 * The retry is only attempted when the cache was not just
		 * built by this call (rv == 0) and arg was a well-formed
		 * name@addr.
		 */
		/* discover all paths and try configuring again */
		if (vhcache_discover_paths(vh) &&
		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
		    NDI_SUCCESS)
			return (MDI_SUCCESS);
	}

	return (MDI_FAILURE);
}
9036 
9037 /*
9038  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
9039  */
9040 static nvlist_t *
9041 read_on_disk_vhci_cache(char *vhci_class)
9042 {
9043 	nvlist_t *nvl;
9044 	int err;
9045 	char *filename;
9046 
9047 	filename = vhclass2vhcache_filename(vhci_class);
9048 
9049 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
9050 		kmem_free(filename, strlen(filename) + 1);
9051 		return (nvl);
9052 	} else if (err == EIO)
9053 		cmn_err(CE_WARN, "%s: I/O error, will recreate\n", filename);
9054 	else if (err == EINVAL)
9055 		cmn_err(CE_WARN,
9056 		    "%s: data file corrupted, will recreate\n", filename);
9057 
9058 	kmem_free(filename, strlen(filename) + 1);
9059 	return (NULL);
9060 }
9061 
9062 /*
9063  * Read on-disk vhci cache into nvlists for all vhci classes.
9064  * Called during booting by i_ddi_read_devices_files().
9065  */
9066 void
9067 mdi_read_devices_files(void)
9068 {
9069 	int i;
9070 
9071 	for (i = 0; i < N_VHCI_CLASSES; i++)
9072 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
9073 }
9074 
9075 /*
9076  * Remove all stale entries from vhci cache.
9077  */
9078 static void
9079 clean_vhcache(mdi_vhci_config_t *vhc)
9080 {
9081 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9082 	mdi_vhcache_phci_t *cphci, *cphci_head, *cphci_next;
9083 	mdi_vhcache_client_t *cct, *cct_head, *cct_next;
9084 	mdi_vhcache_pathinfo_t *cpi, *cpi_head, *cpi_next;
9085 
9086 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9087 
9088 	cct_head = vhcache->vhcache_client_head;
9089 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
9090 	for (cct = cct_head; cct != NULL; cct = cct_next) {
9091 		cct_next = cct->cct_next;
9092 
9093 		cpi_head = cct->cct_cpi_head;
9094 		cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
9095 		for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
9096 			cpi_next = cpi->cpi_next;
9097 			if (cpi->cpi_pip != NULL) {
9098 				ASSERT(cpi->cpi_cphci->cphci_phci != NULL);
9099 				enqueue_tail_vhcache_pathinfo(cct, cpi);
9100 			} else
9101 				free_vhcache_pathinfo(cpi);
9102 		}
9103 
9104 		if (cct->cct_cpi_head != NULL)
9105 			enqueue_vhcache_client(vhcache, cct);
9106 		else {
9107 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
9108 			    (mod_hash_key_t)cct->cct_name_addr);
9109 			free_vhcache_client(cct);
9110 		}
9111 	}
9112 
9113 	cphci_head = vhcache->vhcache_phci_head;
9114 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
9115 	for (cphci = cphci_head; cphci != NULL; cphci = cphci_next) {
9116 		cphci_next = cphci->cphci_next;
9117 		if (cphci->cphci_phci != NULL)
9118 			enqueue_vhcache_phci(vhcache, cphci);
9119 		else
9120 			free_vhcache_phci(cphci);
9121 	}
9122 
9123 	vhcache->vhcache_clean_time = lbolt64;
9124 	rw_exit(&vhcache->vhcache_lock);
9125 	vhcache_dirty(vhc);
9126 }
9127 
9128 /*
9129  * Remove all stale entries from vhci cache.
9130  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
9131  */
9132 void
9133 mdi_clean_vhcache(void)
9134 {
9135 	mdi_vhci_t *vh;
9136 
9137 	mutex_enter(&mdi_mutex);
9138 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9139 		vh->vh_refcnt++;
9140 		mutex_exit(&mdi_mutex);
9141 		clean_vhcache(vh->vh_config);
9142 		mutex_enter(&mdi_mutex);
9143 		vh->vh_refcnt--;
9144 	}
9145 	mutex_exit(&mdi_mutex);
9146 }
9147 
9148 /*
9149  * mdi_vhci_walk_clients():
9150  *		Walker routine to traverse client dev_info nodes
9151  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
9152  * below the client, including nexus devices, which we dont want.
9153  * So we just traverse the immediate siblings, starting from 1st client.
9154  */
9155 void
9156 mdi_vhci_walk_clients(dev_info_t *vdip,
9157     int (*f)(dev_info_t *, void *), void *arg)
9158 {
9159 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9160 	dev_info_t	*cdip;
9161 	mdi_client_t	*ct;
9162 
9163 	MDI_VHCI_CLIENT_LOCK(vh);
9164 	cdip = ddi_get_child(vdip);
9165 	while (cdip) {
9166 		ct = i_devi_get_client(cdip);
9167 		MDI_CLIENT_LOCK(ct);
9168 
9169 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
9170 			cdip = ddi_get_next_sibling(cdip);
9171 		else
9172 			cdip = NULL;
9173 
9174 		MDI_CLIENT_UNLOCK(ct);
9175 	}
9176 	MDI_VHCI_CLIENT_UNLOCK(vh);
9177 }
9178 
9179 /*
9180  * mdi_vhci_walk_phcis():
9181  *		Walker routine to traverse phci dev_info nodes
9182  */
9183 void
9184 mdi_vhci_walk_phcis(dev_info_t *vdip,
9185     int (*f)(dev_info_t *, void *), void *arg)
9186 {
9187 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9188 	mdi_phci_t	*ph, *next;
9189 
9190 	MDI_VHCI_PHCI_LOCK(vh);
9191 	ph = vh->vh_phci_head;
9192 	while (ph) {
9193 		MDI_PHCI_LOCK(ph);
9194 
9195 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
9196 			next = ph->ph_next;
9197 		else
9198 			next = NULL;
9199 
9200 		MDI_PHCI_UNLOCK(ph);
9201 		ph = next;
9202 	}
9203 	MDI_VHCI_PHCI_UNLOCK(vh);
9204 }
9205 
9206 
9207 /*
9208  * mdi_walk_vhcis():
9209  *		Walker routine to traverse vhci dev_info nodes
9210  */
9211 void
9212 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
9213 {
9214 	mdi_vhci_t	*vh = NULL;
9215 
9216 	mutex_enter(&mdi_mutex);
9217 	/*
9218 	 * Scan for already registered vhci
9219 	 */
9220 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9221 		vh->vh_refcnt++;
9222 		mutex_exit(&mdi_mutex);
9223 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
9224 			mutex_enter(&mdi_mutex);
9225 			vh->vh_refcnt--;
9226 			break;
9227 		} else {
9228 			mutex_enter(&mdi_mutex);
9229 			vh->vh_refcnt--;
9230 		}
9231 	}
9232 
9233 	mutex_exit(&mdi_mutex);
9234 }
9235 
9236 /*
9237  * i_mdi_log_sysevent():
9238  *		Logs events for pickup by syseventd
9239  */
9240 static void
9241 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
9242 {
9243 	char		*path_name;
9244 	nvlist_t	*attr_list;
9245 
9246 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
9247 	    KM_SLEEP) != DDI_SUCCESS) {
9248 		goto alloc_failed;
9249 	}
9250 
9251 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
9252 	(void) ddi_pathname(dip, path_name);
9253 
9254 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
9255 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
9256 		goto error;
9257 	}
9258 
9259 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
9260 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
9261 		goto error;
9262 	}
9263 
9264 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
9265 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
9266 		goto error;
9267 	}
9268 
9269 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
9270 	    path_name) != DDI_SUCCESS) {
9271 		goto error;
9272 	}
9273 
9274 	if (nvlist_add_string(attr_list, DDI_CLASS,
9275 	    ph_vh_class) != DDI_SUCCESS) {
9276 		goto error;
9277 	}
9278 
9279 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
9280 	    attr_list, NULL, DDI_SLEEP);
9281 
9282 error:
9283 	kmem_free(path_name, MAXPATHLEN);
9284 	nvlist_free(attr_list);
9285 	return;
9286 
9287 alloc_failed:
9288 	MDI_DEBUG(1, (CE_WARN, dip,
9289 	    "!i_mdi_log_sysevent: Unable to send sysevent"));
9290 }
9291 
/*
 * Return the names of the phci drivers for the given vhci class.
 *
 * *ndrivers is set to the number of names.  The returned array holds exactly
 * that many entries, or is NULL when no driver was found.  The sizes used
 * here match what mdi_free_phci_driver_list() frees.
 */
char **
mdi_get_phci_driver_list(char *vhci_class, int *ndrivers)
{
	char	**names;
	int	*root_support;
	int	count, capacity;

	get_phci_driver_list(vhci_class, &names, &root_support,
	    &count, &capacity);

	*ndrivers = count;
	if (names == NULL)
		return (NULL);

	/* callers only get the names; trim the array to its used length */
	kmem_free(root_support, sizeof (int) * capacity);
	return (mdi_realloc(names, sizeof (char *) * capacity,
	    sizeof (char *) * count));
}
9313 
/*
 * Free a driver name array of ndrivers entries: each name string first,
 * then the array itself.  A NULL driver_list is a no-op.
 */
void
mdi_free_phci_driver_list(char **driver_list, int ndrivers)
{
	int	n;

	if (driver_list == NULL)
		return;

	for (n = 0; n < ndrivers; n++)
		kmem_free(driver_list[n], strlen(driver_list[n]) + 1);
	kmem_free(driver_list, sizeof (char *) * ndrivers);
}
9326 
9327 /*
9328  * mdi_is_dev_supported():
9329  *		function called by pHCI bus config operation to determine if a
9330  *		device should be represented as a child of the vHCI or the
9331  *		pHCI.  This decision is made by the vHCI, using cinfo idenity
9332  *		information passed by the pHCI - specifics of the cinfo
9333  *		representation are by agreement between the pHCI and vHCI.
9334  * Return Values:
9335  *		MDI_SUCCESS
9336  *		MDI_FAILURE
9337  */
9338 int
9339 mdi_is_dev_supported(char *class, dev_info_t *pdip, void *cinfo)
9340 {
9341 	mdi_vhci_t	*vh;
9342 
9343 	ASSERT(class && pdip);
9344 
9345 	/*
9346 	 * For dev_supported, mdi_phci_register() must have established pdip as
9347 	 * a pHCI.
9348 	 *
9349 	 * NOTE: mdi_phci_register() does "mpxio-disable" processing, and
9350 	 * MDI_PHCI(pdip) will return false if mpxio is disabled.
9351 	 */
9352 	if (!MDI_PHCI(pdip))
9353 		return (MDI_FAILURE);
9354 
9355 	/* Return MDI_FAILURE if vHCI does not support asking the question. */
9356 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
9357 	if ((vh == NULL) || (vh->vh_ops->vo_is_dev_supported == NULL)) {
9358 		return (MDI_FAILURE);
9359 	}
9360 
9361 	/* Return vHCI answer */
9362 	return (vh->vh_ops->vo_is_dev_supported(vh->vh_dip, pdip, cinfo));
9363 }
9364 
9365 int
9366 mdi_dc_return_dev_state(mdi_pathinfo_t *pip, struct devctl_iocdata *dcp)
9367 {
9368 	uint_t devstate = 0;
9369 	dev_info_t *cdip;
9370 
9371 	if ((pip == NULL) || (dcp == NULL))
9372 		return (MDI_FAILURE);
9373 
9374 	cdip = mdi_pi_get_client(pip);
9375 
9376 	switch (mdi_pi_get_state(pip)) {
9377 	case MDI_PATHINFO_STATE_INIT:
9378 		devstate = DEVICE_DOWN;
9379 		break;
9380 	case MDI_PATHINFO_STATE_ONLINE:
9381 		devstate = DEVICE_ONLINE;
9382 		if ((cdip) && (devi_stillreferenced(cdip) == DEVI_REFERENCED))
9383 			devstate |= DEVICE_BUSY;
9384 		break;
9385 	case MDI_PATHINFO_STATE_STANDBY:
9386 		devstate = DEVICE_ONLINE;
9387 		break;
9388 	case MDI_PATHINFO_STATE_FAULT:
9389 		devstate = DEVICE_DOWN;
9390 		break;
9391 	case MDI_PATHINFO_STATE_OFFLINE:
9392 		devstate = DEVICE_OFFLINE;
9393 		break;
9394 	default:
9395 		ASSERT(MDI_PI(pip)->pi_state);
9396 	}
9397 
9398 	if (copyout(&devstate, dcp->cpyout_buf, sizeof (uint_t)) != 0)
9399 		return (MDI_FAILURE);
9400 
9401 	return (MDI_SUCCESS);
9402 }
9403