1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25  * Copyright 2015 RackTop Systems.
26  * Copyright (c) 2016, Intel Corporation.
27  */
28 
29 /*
30  * Pool import support functions.
31  *
32  * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
33  * these commands are expected to run in the global zone, we can assume
34  * that the devices are all readable when called.
35  *
36  * To import a pool, we rely on reading the configuration information from the
37  * ZFS label of each device.  If we successfully read the label, then we
38  * organize the configuration information in the following hierarchy:
39  *
40  *	pool guid -> toplevel vdev guid -> label txg
41  *
42  * Duplicate entries matching this same tuple will be discarded.  Once we have
43  * examined every device, we pick the best label txg config for each toplevel
44  * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
45  * update any paths that have changed.  Finally, we attempt to import the pool
46  * using our derived config, and record the results.
47  */
48 
49 #include <ctype.h>
50 #include <dirent.h>
51 #include <errno.h>
52 #include <libintl.h>
53 #include <libgen.h>
54 #include <stddef.h>
55 #include <stdlib.h>
56 #include <stdio.h>
57 #include <string.h>
58 #include <sys/stat.h>
59 #include <unistd.h>
60 #include <fcntl.h>
61 #include <sys/dktp/fdisk.h>
62 #include <sys/vdev_impl.h>
63 #include <sys/fs/zfs.h>
64 
65 #include <thread_pool.h>
66 #include <libzutil.h>
67 #include <libnvpair.h>
68 #include <libzfs.h>
69 
70 #include "zutil_import.h"
71 
72 #ifdef HAVE_LIBUDEV
73 #include <libudev.h>
74 #include <sched.h>
75 #endif
76 #include <blkid/blkid.h>
77 
78 #define	DEV_BYID_PATH	"/dev/disk/by-id/"
79 
80 /*
81  * Skip devices with well known prefixes:
82  * opening them can have side effects which need to be avoided.
83  *
84  * hpet        - High Precision Event Timer
85  * watchdog[N] - Watchdog must be closed in a special way.
86  */
87 static boolean_t
88 should_skip_dev(const char *dev)
89 {
90 	return ((strcmp(dev, "watchdog") == 0) ||
91 	    (strncmp(dev, "watchdog", 8) == 0 && isdigit(dev[8])) ||
92 	    (strcmp(dev, "hpet") == 0));
93 }
94 
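/*
 * Flush the kernel's buffer cache for an open block device (BLKFLSBUF).
 */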
95 int
96 zfs_dev_flush(int fd)
97 {
98 	return (ioctl(fd, BLKFLSBUF));
99 }
100 
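/*
 * Examine a single candidate device: read its ZFS label, verify the vdev
 * guid, and attach the resulting config to the rdsk_node_t.  When
 * rn_labelpaths is set, additional candidate nodes are added for the path
 * and devid strings recorded in the label.
 */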
101 void
102 zpool_open_func(void *arg)
103 {
104 	rdsk_node_t *rn = arg;
105 	libpc_handle_t *hdl = rn->rn_hdl;
106 	struct stat64 statbuf;
107 	nvlist_t *config;
108 	uint64_t vdev_guid = 0;
109 	int error;
110 	int num_labels = 0;
111 	int fd;
112 
113 	if (should_skip_dev(zfs_basename(rn->rn_name)))
114 		return;
115 
116 	/*
117 	 * Ignore failed stats.  We only want regular files and block devices.
118 	 * Ignore files that are too small to hold a zpool.
119 	 */
120 	if (stat64(rn->rn_name, &statbuf) != 0 ||
121 	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)) ||
122 	    (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE))
123 		return;
124 
125 	/*
126 	 * Preferentially open using O_DIRECT to bypass the block device
127 	 * cache which may be stale for multipath devices.  An EINVAL errno
128 	 * indicates O_DIRECT is unsupported, so fall back to just O_RDONLY.
129 	 */
130 	fd = open(rn->rn_name, O_RDONLY | O_DIRECT | O_CLOEXEC);
131 	if ((fd < 0) && (errno == EINVAL))
132 		fd = open(rn->rn_name, O_RDONLY | O_CLOEXEC);
133 	if ((fd < 0) && (errno == EACCES))
134 		hdl->lpc_open_access_error = B_TRUE;
135 	if (fd < 0)
136 		return;
137 
138 	error = zpool_read_label(fd, &config, &num_labels);
139 	if (error != 0) {
140 		(void) close(fd);
141 		return;
142 	}
143 
144 	if (num_labels == 0) {
145 		(void) close(fd);
146 		nvlist_free(config);
147 		return;
148 	}
149 
150 	/*
151 	 * Check that the vdev is for the expected guid.  Additional entries
152 	 * are speculatively added based on the paths stored in the labels.
153 	 * Entries with valid paths but incorrect guids must be removed.
154 	 */
155 	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
156 	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
157 		(void) close(fd);
158 		nvlist_free(config);
159 		return;
160 	}
161 
162 	(void) close(fd);
163 
164 	rn->rn_config = config;
165 	rn->rn_num_labels = num_labels;
166 
167 	/*
168 	 * Add additional entries for paths described by this label.
169 	 */
170 	if (rn->rn_labelpaths) {
171 		const char *path = NULL;
172 		const char *devid = NULL;
173 		rdsk_node_t *slice;
174 		avl_index_t where;
175 		int error;
176 
177 		if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
178 			return;
179 
180 		/*
181 		 * Allow devlinks to stabilize so all paths are available.
182 		 */
183 		zpool_disk_wait(rn->rn_name);
184 
185 		if (path != NULL) {
186 			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
187 			slice->rn_name = zutil_strdup(hdl, path);
188 			slice->rn_vdev_guid = vdev_guid;
189 			slice->rn_avl = rn->rn_avl;
190 			slice->rn_hdl = hdl;
191 			slice->rn_order = IMPORT_ORDER_PREFERRED_1;
192 			slice->rn_labelpaths = B_FALSE;
193 			pthread_mutex_lock(rn->rn_lock);
194 			if (avl_find(rn->rn_avl, slice, &where)) {
195 				pthread_mutex_unlock(rn->rn_lock);
196 				free(slice->rn_name);
197 				free(slice);
198 			} else {
199 				avl_insert(rn->rn_avl, slice, where);
200 				pthread_mutex_unlock(rn->rn_lock);
201 				zpool_open_func(slice);
202 			}
203 		}
204 
205 		if (devid != NULL) {
206 			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
207 			error = asprintf(&slice->rn_name, "%s%s",
208 			    DEV_BYID_PATH, devid);
209 			if (error == -1) {
210 				free(slice);
211 				return;
212 			}
213 
214 			slice->rn_vdev_guid = vdev_guid;
215 			slice->rn_avl = rn->rn_avl;
216 			slice->rn_hdl = hdl;
217 			slice->rn_order = IMPORT_ORDER_PREFERRED_2;
218 			slice->rn_labelpaths = B_FALSE;
219 			pthread_mutex_lock(rn->rn_lock);
220 			if (avl_find(rn->rn_avl, slice, &where)) {
221 				pthread_mutex_unlock(rn->rn_lock);
222 				free(slice->rn_name);
223 				free(slice);
224 			} else {
225 				avl_insert(rn->rn_avl, slice, where);
226 				pthread_mutex_unlock(rn->rn_lock);
227 				zpool_open_func(slice);
228 			}
229 		}
230 	}
231 }
232 
233 static const char * const
234 zpool_default_import_path[] = {
235 	"/dev/disk/by-vdev",	/* Custom rules, use first if they exist */
236 	"/dev/mapper",		/* Use multipath devices before components */
237 	"/dev/disk/by-partlabel", /* Single unique entry set by user */
238 	"/dev/disk/by-partuuid", /* Generated partition uuid */
239 	"/dev/disk/by-label",	/* Custom persistent labels */
240 	"/dev/disk/by-uuid",	/* Single unique entry and persistent */
241 	"/dev/disk/by-id",	/* May be multiple entries and persistent */
242 	"/dev/disk/by-path",	/* Encodes physical location and persistent */
243 	"/dev"			/* UNSAFE device names will change */
244 };
245 
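/*
 * Return the default list of import search directories and store its
 * length in 'count'.
 */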
246 const char * const *
247 zpool_default_search_paths(size_t *count)
248 {
249 	*count = ARRAY_SIZE(zpool_default_import_path);
250 	return (zpool_default_import_path);
251 }
252 
253 /*
254  * Given a full path to a device, determine whether that device appears in
255  * the import search path.  If it does, store the index of the first match
256  * in the passed 'order' variable and return 0; otherwise return ENOENT.
257  */
258 static int
259 zfs_path_order(const char *name, int *order)
260 {
261 	const char *env = getenv("ZPOOL_IMPORT_PATH");
262 
263 	if (env) {
264 		for (int i = 0; ; ++i) {
265 			env += strspn(env, ":");
266 			size_t dirlen = strcspn(env, ":");
267 			if (dirlen) {
268 				if (strncmp(name, env, dirlen) == 0) {
269 					*order = i;
270 					return (0);
271 				}
272 
273 				env += dirlen;
274 			} else
275 				break;
276 		}
277 	} else {
278 		for (int i = 0; i < ARRAY_SIZE(zpool_default_import_path);
279 		    ++i) {
280 			if (strncmp(name, zpool_default_import_path[i],
281 			    strlen(zpool_default_import_path[i])) == 0) {
282 				*order = i;
283 				return (0);
284 			}
285 		}
286 	}
287 
288 	return (ENOENT);
289 }
290 
291 /*
292  * Use libblkid to quickly enumerate all known ZFS member devices.
293  */
294 int
295 zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
296     avl_tree_t **slice_cache)
297 {
298 	rdsk_node_t *slice;
299 	blkid_cache cache;
300 	blkid_dev_iterate iter;
301 	blkid_dev dev;
302 	avl_index_t where;
303 	int error;
304 
305 	*slice_cache = NULL;
306 
307 	error = blkid_get_cache(&cache, NULL);
308 	if (error != 0)
309 		return (error);
310 
311 	error = blkid_probe_all_new(cache);
312 	if (error != 0) {
313 		blkid_put_cache(cache);
314 		return (error);
315 	}
316 
317 	iter = blkid_dev_iterate_begin(cache);
318 	if (iter == NULL) {
319 		blkid_put_cache(cache);
320 		return (EINVAL);
321 	}
322 
323 	/* blkid_dev_set_search() takes const char * only since 2.32 */
324 	error = blkid_dev_set_search(iter,
325 	    (char *)"TYPE", (char *)"zfs_member");
326 	if (error != 0) {
327 		blkid_dev_iterate_end(iter);
328 		blkid_put_cache(cache);
329 		return (error);
330 	}
331 
332 	*slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t));
333 	avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
334 	    offsetof(rdsk_node_t, rn_node));
335 
336 	while (blkid_dev_next(iter, &dev) == 0) {
337 		slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
338 		slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev));
339 		slice->rn_vdev_guid = 0;
340 		slice->rn_lock = lock;
341 		slice->rn_avl = *slice_cache;
342 		slice->rn_hdl = hdl;
343 		slice->rn_labelpaths = B_TRUE;
344 
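		/*
		 * Rank the device by where its directory appears in the
		 * import search path; devices found outside the search
		 * path get the default ordering.
		 */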
345 		error = zfs_path_order(slice->rn_name, &slice->rn_order);
346 		if (error == 0)
347 			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
348 		else
349 			slice->rn_order = IMPORT_ORDER_DEFAULT;
350 
351 		pthread_mutex_lock(lock);
352 		if (avl_find(*slice_cache, slice, &where)) {
353 			free(slice->rn_name);
354 			free(slice);
355 		} else {
356 			avl_insert(*slice_cache, slice, where);
357 		}
358 		pthread_mutex_unlock(lock);
359 	}
360 
361 	blkid_dev_iterate_end(iter);
362 	blkid_put_cache(cache);
363 
364 	return (0);
365 }
366 
367 /*
368  * Linux persistent device strings for vdev labels
369  *
370  * based on libudev for consistency with libudev disk add/remove events
371  */
372 
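/*
 * Buffers for the devid and physical path strings written to a vdev label.
 */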
373 typedef struct vdev_dev_strs {
374 	char	vds_devid[128];
375 	char	vds_devphys[128];
376 } vdev_dev_strs_t;
377 
378 #ifdef HAVE_LIBUDEV
379 
380 /*
381  * Obtain the persistent device id string (describes what)
382  *
383  * used by ZED vdev matching for auto-{online,expand,replace}
384  */
385 int
386 zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
387 {
388 	struct udev_list_entry *entry;
389 	const char *bus;
390 	char devbyid[MAXPATHLEN];
391 
392 	/* The bus-based by-id path is preferred */
393 	bus = udev_device_get_property_value(dev, "ID_BUS");
394 
395 	if (bus == NULL) {
396 		const char *dm_uuid;
397 
398 		/*
399 		 * For multipath nodes use the persistent uuid based identifier
400 		 *
401 		 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
402 		 */
403 		dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
404 		if (dm_uuid != NULL) {
405 			(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
406 			return (0);
407 		}
408 
409 		/*
410 		 * For volumes use the persistent /dev/zvol/dataset identifier
411 		 */
412 		entry = udev_device_get_devlinks_list_entry(dev);
413 		while (entry != NULL) {
414 			const char *name;
415 
416 			name = udev_list_entry_get_name(entry);
417 			if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
418 				(void) strlcpy(bufptr, name, buflen);
419 				return (0);
420 			}
421 			entry = udev_list_entry_get_next(entry);
422 		}
423 
424 		/*
425 		 * NVMe 'by-id' symlinks are similar to the bus case
426 		 */
427 		struct udev_device *parent;
428 
429 		parent = udev_device_get_parent_with_subsystem_devtype(dev,
430 		    "nvme", NULL);
431 		if (parent != NULL)
432 			bus = "nvme";	/* continue with bus symlink search */
433 		else
434 			return (ENODATA);
435 	}
436 
437 	/*
438 	 * locate the bus specific by-id link
439 	 */
440 	(void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
441 	entry = udev_device_get_devlinks_list_entry(dev);
442 	while (entry != NULL) {
443 		const char *name;
444 
445 		name = udev_list_entry_get_name(entry);
446 		if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
447 			name += strlen(DEV_BYID_PATH);
448 			(void) strlcpy(bufptr, name, buflen);
449 			return (0);
450 		}
451 		entry = udev_list_entry_get_next(entry);
452 	}
453 
454 	return (ENODATA);
455 }
456 
457 /*
458  * Obtain the persistent physical location string (describes where)
459  *
460  * used by ZED vdev matching for auto-{online,expand,replace}
461  */
462 int
463 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
464 {
465 	const char *physpath = NULL;
466 	struct udev_list_entry *entry;
467 
468 	/*
469 	 * Normal disks use ID_PATH for their physical path.
470 	 */
471 	physpath = udev_device_get_property_value(dev, "ID_PATH");
472 	if (physpath != NULL && strlen(physpath) > 0) {
473 		(void) strlcpy(bufptr, physpath, buflen);
474 		return (0);
475 	}
476 
477 	/*
478 	 * Device mapper devices are virtual and don't have a physical
479 	 * path.  For them we use ID_VDEV instead, which is set up via the
480 	 * /etc/vdev_id.conf file.  ID_VDEV provides a persistent path
481 	 * to a virtual device.  If you don't have vdev_id.conf set up,
482 	 * you cannot use multipath autoreplace with device mapper.
483 	 */
484 	physpath = udev_device_get_property_value(dev, "ID_VDEV");
485 	if (physpath != NULL && strlen(physpath) > 0) {
486 		(void) strlcpy(bufptr, physpath, buflen);
487 		return (0);
488 	}
489 
490 	/*
491 	 * For ZFS volumes use the persistent /dev/zvol/dataset identifier
492 	 */
493 	entry = udev_device_get_devlinks_list_entry(dev);
494 	while (entry != NULL) {
495 		physpath = udev_list_entry_get_name(entry);
496 		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
497 			(void) strlcpy(bufptr, physpath, buflen);
498 			return (0);
499 		}
500 		entry = udev_list_entry_get_next(entry);
501 	}
502 
503 	/*
504 	 * For all other devices fall back to using the by-uuid name.
505 	 */
506 	entry = udev_device_get_devlinks_list_entry(dev);
507 	while (entry != NULL) {
508 		physpath = udev_list_entry_get_name(entry);
509 		if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
510 			(void) strlcpy(bufptr, physpath, buflen);
511 			return (0);
512 		}
513 		entry = udev_list_entry_get_next(entry);
514 	}
515 
516 	return (ENODATA);
517 }
518 
519 /*
520  * A disk is considered a multipath whole disk when:
521  *	DEVNAME key value begins with "/dev/dm-"
522  *	DM_UUID key exists
524  *	ID_PART_TABLE_TYPE key does not exist or is not gpt
525  */
526 static boolean_t
527 udev_mpath_whole_disk(struct udev_device *dev)
528 {
529 	const char *devname, *type, *uuid;
530 
531 	devname = udev_device_get_property_value(dev, "DEVNAME");
532 	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
533 	uuid = udev_device_get_property_value(dev, "DM_UUID");
534 
535 	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
536 	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
537 	    (uuid != NULL)) {
538 		return (B_TRUE);
539 	}
540 
541 	return (B_FALSE);
542 }
543 
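/*
 * Returns non-zero once udev has finished processing the device and its
 * properties (in particular DEVLINKS) can be trusted.
 */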
544 static int
545 udev_device_is_ready(struct udev_device *dev)
546 {
547 #ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
548 	return (udev_device_get_is_initialized(dev));
549 #else
550 	/* wait for DEVLINKS property to be initialized */
551 	return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
552 #endif
553 }
554 
555 #else
556 
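/* Stubs for builds without libudev: no devid or physical path available. */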
557 int
558 zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
559 {
560 	(void) dev, (void) bufptr, (void) buflen;
561 	return (ENODATA);
562 }
563 
564 int
565 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
566 {
567 	(void) dev, (void) bufptr, (void) buflen;
568 	return (ENODATA);
569 }
570 
571 #endif /* HAVE_LIBUDEV */
572 
573 /*
574  * Wait up to timeout_ms for udev to set up the device node.  The device is
575  * considered ready when libudev determines it has been initialized, all of
576  * the device links have been verified to exist, and it has been allowed to
577  * settle.  At this point the device can be accessed reliably. Depending on
578  * the complexity of the udev rules this process could take several seconds.
579  */
580 int
581 zpool_label_disk_wait(const char *path, int timeout_ms)
582 {
583 #ifdef HAVE_LIBUDEV
584 	struct udev *udev;
585 	struct udev_device *dev = NULL;
586 	char nodepath[MAXPATHLEN];
587 	char *sysname = NULL;
588 	int ret = ENODEV;
589 	int settle_ms = 50;
590 	long sleep_ms = 10;
591 	hrtime_t start, settle;
592 
593 	if ((udev = udev_new()) == NULL)
594 		return (ENXIO);
595 
596 	start = gethrtime();
597 	settle = 0;
598 
599 	do {
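		/*
		 * Resolve the path to its block device sysname once; if the
		 * node does not exist yet, sleep and retry.
		 */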
600 		if (sysname == NULL) {
601 			if (realpath(path, nodepath) != NULL) {
602 				sysname = strrchr(nodepath, '/') + 1;
603 			} else {
604 				(void) usleep(sleep_ms * MILLISEC);
605 				continue;
606 			}
607 		}
608 
609 		dev = udev_device_new_from_subsystem_sysname(udev,
610 		    "block", sysname);
611 		if ((dev != NULL) && udev_device_is_ready(dev)) {
612 			struct udev_list_entry *links, *link = NULL;
613 
614 			ret = 0;
615 			links = udev_device_get_devlinks_list_entry(dev);
616 
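			/*
			 * Verify that every advertised devlink exists; if
			 * any is missing, reset the settle timer and keep
			 * waiting.
			 */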
617 			udev_list_entry_foreach(link, links) {
618 				struct stat64 statbuf;
619 				const char *name;
620 
621 				name = udev_list_entry_get_name(link);
622 				errno = 0;
623 				if (stat64(name, &statbuf) == 0 && errno == 0)
624 					continue;
625 
626 				settle = 0;
627 				ret = ENODEV;
628 				break;
629 			}
630 
631 			if (ret == 0) {
632 				if (settle == 0) {
633 					settle = gethrtime();
634 				} else if (NSEC2MSEC(gethrtime() - settle) >=
635 				    settle_ms) {
636 					udev_device_unref(dev);
637 					break;
638 				}
639 			}
640 		}
641 
642 		udev_device_unref(dev);
643 		(void) usleep(sleep_ms * MILLISEC);
644 
645 	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
646 
647 	udev_unref(udev);
648 
649 	return (ret);
650 #else
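	/*
	 * Without libudev, poll with stat(2) until the device node exists
	 * and has remained present for the settle time.
	 */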
651 	int settle_ms = 50;
652 	long sleep_ms = 10;
653 	hrtime_t start, settle;
654 	struct stat64 statbuf;
655 
656 	start = gethrtime();
657 	settle = 0;
658 
659 	do {
660 		errno = 0;
661 		if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
662 			if (settle == 0)
663 				settle = gethrtime();
664 			else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
665 				return (0);
666 		} else if (errno != ENOENT) {
667 			return (errno);
668 		}
669 
670 		usleep(sleep_ms * MILLISEC);
671 	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
672 
673 	return (ENODEV);
674 #endif /* HAVE_LIBUDEV */
675 }
676 
677 /*
678  * Simplified version of zpool_label_disk_wait() where we wait for a device
679  * to appear using the default timeout.
680  */
681 int
682 zpool_disk_wait(const char *path)
683 {
684 	int timeout;
685 	timeout = zpool_getenv_int("ZPOOL_IMPORT_UDEV_TIMEOUT_MS",
686 	    DISK_LABEL_WAIT);
687 
688 	return (zpool_label_disk_wait(path, timeout));
689 }
690 
691 /*
692  * Encode the persistent device strings
693  * used for the vdev disk label
694  */
695 static int
696 encode_device_strings(const char *path, vdev_dev_strs_t *ds,
697     boolean_t wholedisk)
698 {
699 #ifdef HAVE_LIBUDEV
700 	struct udev *udev;
701 	struct udev_device *dev = NULL;
702 	char nodepath[MAXPATHLEN];
703 	char *sysname;
704 	int ret = ENODEV;
705 	hrtime_t start;
706 
707 	if ((udev = udev_new()) == NULL)
708 		return (ENXIO);
709 
710 	/* resolve path to a runtime device node instance */
711 	if (realpath(path, nodepath) == NULL)
712 		goto no_dev;
713 
714 	sysname = strrchr(nodepath, '/') + 1;
715 
716 	/*
717 	 * Wait up to 3 seconds for udev to set up the device node context
718 	 */
719 	start = gethrtime();
720 	do {
721 		dev = udev_device_new_from_subsystem_sysname(udev, "block",
722 		    sysname);
723 		if (dev == NULL)
724 			goto no_dev;
725 		if (udev_device_is_ready(dev))
726 			break;  /* udev ready */
727 
728 		udev_device_unref(dev);
729 		dev = NULL;
730 
731 		if (NSEC2MSEC(gethrtime() - start) < 10)
732 			(void) sched_yield();	/* yield/busy wait up to 10ms */
733 		else
734 			(void) usleep(10 * MILLISEC);
735 
736 	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
737 
738 	if (dev == NULL)
739 		goto no_dev;
740 
741 	/*
742 	 * Only whole disks require extra device strings
743 	 */
744 	if (!wholedisk && !udev_mpath_whole_disk(dev))
745 		goto no_dev;
746 
747 	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
748 	if (ret != 0)
749 		goto no_dev_ref;
750 
751 	/* physical location string (optional) */
752 	if (zfs_device_get_physical(dev, ds->vds_devphys,
753 	    sizeof (ds->vds_devphys)) != 0) {
754 		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
755 	}
756 
757 no_dev_ref:
758 	udev_device_unref(dev);
759 no_dev:
760 	udev_unref(udev);
761 
762 	return (ret);
763 #else
764 	(void) path;
765 	(void) ds;
766 	(void) wholedisk;
767 	return (ENOENT);
768 #endif
769 }
770 
771 /*
772  * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it
773  * in the given nvlist (if applicable).  For example:
774  *    vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
775  *
776  * If an old path was in the nvlist, and the rescan cannot find a new path,
777  * then keep the old path, since the disk may have been removed.
778  *
779  * path: The vdev path (value from ZPOOL_CONFIG_PATH)
780  * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH)
781  */
782 void
783 update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path,
784     const char *key)
785 {
786 	char *upath, *spath;
787 	const char *oldpath = NULL;
788 
789 	(void) nvlist_lookup_string(nv, key, &oldpath);
790 
791 	/* Add enclosure sysfs path (if disk is in an enclosure). */
792 	upath = zfs_get_underlying_path(path);
793 	spath = zfs_get_enclosure_sysfs_path(upath);
794 
795 	if (spath) {
796 		(void) nvlist_add_string(nv, key, spath);
797 	} else {
798 		/*
799 		 * We couldn't dynamically scan the disk's enclosure sysfs path.
800 		 * This could be because the disk went away.  If there's an old
801 		 * enclosure sysfs path in the nvlist, then keep using it.
802 		 */
803 		if (!oldpath) {
804 			(void) nvlist_remove_all(nv, key);
805 		}
806 	}
807 
808 	free(upath);
809 	free(spath);
810 }
811 
812 /*
813  * This will get called for each leaf vdev.
814  */
815 static int
816 sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data)
817 {
818 	(void) hdl_data, (void) data;
819 
820 	const char *path = NULL;
821 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
822 		return (1);
823 
824 	/* Rescan our enclosure sysfs path for this vdev */
825 	update_vdev_config_dev_sysfs_path(nv, path,
826 	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
827 	return (0);
828 }
829 
830 /*
831  * Given an nvlist for our pool (with vdev tree), iterate over all the
832  * leaf vdevs and update their ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH.
833  */
834 void
835 update_vdevs_config_dev_sysfs_path(nvlist_t *config)
836 {
837 	nvlist_t *nvroot = NULL;
838 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
839 	    &nvroot) == 0);
840 	for_each_vdev_in_nvlist(nvroot, sysfs_path_pool_vdev_iter_f, NULL);
841 }
842 
843 /*
844  * Update a leaf vdev's persistent device strings
845  *
846  * - only applies for a dedicated leaf vdev (aka whole disk)
847  * - updated during pool create|add|attach|import
848  * - used for device matching during auto-{online,expand,replace}
849  * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
850  * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
851  *
852  * single device node example:
853  * 	devid:		'scsi-MG03SCA300_350000494a8cb3d67-part1'
854  * 	phys_path:	'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
855  *
856  * multipath device node example:
857  * 	devid:		'dm-uuid-mpath-35000c5006304de3f'
858  *
859  * We also store the enclosure sysfs path for turning on enclosure LEDs
860  * (if applicable):
861  *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
862  */
863 void
864 update_vdev_config_dev_strs(nvlist_t *nv)
865 {
866 	vdev_dev_strs_t vds;
867 	const char *env, *type, *path;
868 	uint64_t wholedisk = 0;
869 
870 	/*
871 	 * For the benefit of legacy ZFS implementations, allow
872 	 * for opting out of devid strings in the vdev label.
873 	 *
874 	 * example use:
875 	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
876 	 *
877 	 * explanation:
878 	 * Older OpenZFS implementations had issues when attempting to
879 	 * display pool config VDEV names if a "devid" NVP value is
880 	 * present in the pool's config.
881 	 *
882 	 * For example, a pool that originated on illumos platform would
883 	 * have a devid value in the config and "zpool status" would fail
884 	 * when listing the config.
885 	 *
886 	 * A pool can be stripped of any "devid" values on import or
887 	 * prevented from adding them on zpool create|add by setting
888 	 * ZFS_VDEV_DEVID_OPT_OUT.
889 	 */
890 	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
891 	if (env && (strtoul(env, NULL, 0) > 0 ||
892 	    !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
893 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
894 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
895 		return;
896 	}
897 
898 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
899 	    strcmp(type, VDEV_TYPE_DISK) != 0) {
900 		return;
901 	}
902 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
903 		return;
904 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
905 
906 	/*
907 	 * Update device string values in the config nvlist.
908 	 */
909 	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
910 		(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
911 		if (vds.vds_devphys[0] != '\0') {
912 			(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
913 			    vds.vds_devphys);
914 		}
915 		update_vdev_config_dev_sysfs_path(nv, path,
916 		    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
917 	} else {
918 		/* Clear out any stale entries. */
919 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
920 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
921 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
922 	}
923 }
924