/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2015 RackTop Systems.
 * Copyright (c) 2016, Intel Corporation.
 */

/*
 * Pool import support functions.
 *
 * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
 * these commands are expected to run in the global zone, we can assume
 * that the devices are all readable when called.
 *
 * To import a pool, we rely on reading the configuration information from the
 * ZFS label of each device.  If we successfully read the label, then we
 * organize the configuration information in the following hierarchy:
 *
 *	pool guid -> toplevel vdev guid -> label txg
 *
 * Duplicate entries matching this same tuple will be discarded.  Once we have
 * examined every device, we pick the best label txg config for each toplevel
 * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
 * update any paths that have changed.  Finally, we attempt to import the pool
 * using our derived config, and record the results.
 */

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <libintl.h>
#include <libgen.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/dktp/fdisk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>

#include <thread_pool.h>
#include <libzutil.h>
#include <libnvpair.h>
#include <libzfs.h>

#include "zutil_import.h"

#ifdef HAVE_LIBUDEV
#include <libudev.h>
#include <sched.h>
#endif
#include <blkid/blkid.h>

#define	DEV_BYID_PATH	"/dev/disk/by-id/"

/*
 * Skip devices with well-known prefixes; opening them can have side effects
 * which need to be avoided.
 *
 * hpet        - High Precision Event Timer
 * watchdog[N] - Watchdog must be closed in a special way.
 */
static boolean_t
should_skip_dev(const char *dev)
{
	return ((strcmp(dev, "watchdog") == 0) ||
	    (strncmp(dev, "watchdog", 8) == 0 && isdigit(dev[8])) ||
	    (strcmp(dev, "hpet") == 0));
}

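/*
 * Flush the buffer cache of the block device backing 'fd' via the
 * BLKFLSBUF ioctl.
 */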
int
zfs_dev_flush(int fd)
{
	return (ioctl(fd, BLKFLSBUF));
}

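/*
 * Open a candidate device, read its ZFS label, and if the label is valid
 * attach the resulting config nvlist to the rdsk node.  Called for each
 * candidate device discovered during an import scan, and recursively for
 * any additional paths recorded in the label.
 */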
void
zpool_open_func(void *arg)
{
	rdsk_node_t *rn = arg;
	libpc_handle_t *hdl = rn->rn_hdl;
	struct stat64 statbuf;
	nvlist_t *config;
	uint64_t vdev_guid = 0;
	int error;
	int num_labels = 0;
	int fd;

	if (should_skip_dev(zfs_basename(rn->rn_name)))
		return;

	/*
	 * Ignore failed stats.  We only want regular files and block devices.
	 * Ignore files that are too small to hold a zpool.
	 */
	if (stat64(rn->rn_name, &statbuf) != 0 ||
	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)) ||
	    (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE))
		return;

	/*
	 * Preferentially open using O_DIRECT to bypass the block device
	 * cache, which may be stale for multipath devices.  An EINVAL errno
	 * indicates O_DIRECT is unsupported, so fall back to plain O_RDONLY.
	 */
	fd = open(rn->rn_name, O_RDONLY | O_DIRECT | O_CLOEXEC);
	if ((fd < 0) && (errno == EINVAL))
		fd = open(rn->rn_name, O_RDONLY | O_CLOEXEC);
	if ((fd < 0) && (errno == EACCES))
		hdl->lpc_open_access_error = B_TRUE;
	if (fd < 0)
		return;

	error = zpool_read_label(fd, &config, &num_labels);
	if (error != 0) {
		(void) close(fd);
		return;
	}

	if (num_labels == 0) {
		(void) close(fd);
		nvlist_free(config);
		return;
	}

	/*
	 * Check that the vdev is for the expected guid.  Additional entries
	 * are speculatively added based on the paths stored in the labels.
	 * Entries with valid paths but incorrect guids must be removed.
	 */
	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
		(void) close(fd);
		nvlist_free(config);
		return;
	}

	(void) close(fd);

	rn->rn_config = config;
	rn->rn_num_labels = num_labels;

	/*
	 * Add additional entries for paths described by this label.
	 */
	if (rn->rn_labelpaths) {
		const char *path = NULL;
		const char *devid = NULL;
		const char *env = NULL;
		rdsk_node_t *slice;
		avl_index_t where;
		int timeout;
		int error;

		if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
			return;

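		/*
		 * The wait below for udev links defaults to DISK_LABEL_WAIT
		 * milliseconds but may be tuned with the
		 * ZPOOL_IMPORT_UDEV_TIMEOUT_MS environment variable.
		 */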
		env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS");
		if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 ||
		    timeout < 0) {
			timeout = DISK_LABEL_WAIT;
		}

		/*
		 * Allow devlinks to stabilize so all paths are available.
		 */
		zpool_label_disk_wait(rn->rn_name, timeout);

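		/*
		 * The path recorded in the label is queued with
		 * IMPORT_ORDER_PREFERRED_1 so it ranks ahead of the
		 * /dev/disk/by-id alias (IMPORT_ORDER_PREFERRED_2) added
		 * below when duplicate entries for this vdev are resolved.
		 */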
		if (path != NULL) {
			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
			slice->rn_name = zutil_strdup(hdl, path);
			slice->rn_vdev_guid = vdev_guid;
			slice->rn_avl = rn->rn_avl;
			slice->rn_hdl = hdl;
			slice->rn_order = IMPORT_ORDER_PREFERRED_1;
			slice->rn_labelpaths = B_FALSE;
			pthread_mutex_lock(rn->rn_lock);
			if (avl_find(rn->rn_avl, slice, &where)) {
				pthread_mutex_unlock(rn->rn_lock);
				free(slice->rn_name);
				free(slice);
			} else {
				avl_insert(rn->rn_avl, slice, where);
				pthread_mutex_unlock(rn->rn_lock);
				zpool_open_func(slice);
			}
		}

		if (devid != NULL) {
			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
			error = asprintf(&slice->rn_name, "%s%s",
			    DEV_BYID_PATH, devid);
			if (error == -1) {
				free(slice);
				return;
			}

			slice->rn_vdev_guid = vdev_guid;
			slice->rn_avl = rn->rn_avl;
			slice->rn_hdl = hdl;
			slice->rn_order = IMPORT_ORDER_PREFERRED_2;
			slice->rn_labelpaths = B_FALSE;
			pthread_mutex_lock(rn->rn_lock);
			if (avl_find(rn->rn_avl, slice, &where)) {
				pthread_mutex_unlock(rn->rn_lock);
				free(slice->rn_name);
				free(slice);
			} else {
				avl_insert(rn->rn_avl, slice, where);
				pthread_mutex_unlock(rn->rn_lock);
				zpool_open_func(slice);
			}
		}
	}
}

static const char * const
zpool_default_import_path[] = {
	"/dev/disk/by-vdev",	/* Custom rules, use first if they exist */
	"/dev/mapper",		/* Use multipath devices before components */
	"/dev/disk/by-partlabel", /* Single unique entry set by user */
	"/dev/disk/by-partuuid", /* Generated partition uuid */
	"/dev/disk/by-label",	/* Custom persistent labels */
	"/dev/disk/by-uuid",	/* Single unique entry and persistent */
	"/dev/disk/by-id",	/* May be multiple entries and persistent */
	"/dev/disk/by-path",	/* Encodes physical location and persistent */
	"/dev"			/* UNSAFE device names will change */
};

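/*
 * Return the built-in device search path used during import along with the
 * number of entries it contains.
 */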
const char * const *
zpool_default_search_paths(size_t *count)
{
	*count = ARRAY_SIZE(zpool_default_import_path);
	return (zpool_default_import_path);
}

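/*
 * The directories above are only defaults; the search path may be overridden
 * with the ZPOOL_IMPORT_PATH environment variable, which holds a
 * colon-separated list of directories, e.g.:
 *
 *	ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:/dev/mapper" zpool import
 */
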
/*
 * Given a full path to a device, determine whether that device appears in the
 * import search path.  If it does, store the index of the first matching
 * directory in the passed 'order' variable and return 0; otherwise return an
 * error.
 */
static int
zfs_path_order(const char *name, int *order)
{
	const char *env = getenv("ZPOOL_IMPORT_PATH");

	if (env) {
		for (int i = 0; ; ++i) {
			env += strspn(env, ":");
			size_t dirlen = strcspn(env, ":");
			if (dirlen) {
				if (strncmp(name, env, dirlen) == 0) {
					*order = i;
					return (0);
				}

				env += dirlen;
			} else
				break;
		}
	} else {
		for (int i = 0; i < ARRAY_SIZE(zpool_default_import_path);
		    ++i) {
			if (strncmp(name, zpool_default_import_path[i],
			    strlen(zpool_default_import_path[i])) == 0) {
				*order = i;
				return (0);
			}
		}
	}

	return (ENOENT);
}

/*
 * Use libblkid to quickly enumerate all known zfs devices.
 */
int
zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t **slice_cache)
{
	rdsk_node_t *slice;
	blkid_cache cache;
	blkid_dev_iterate iter;
	blkid_dev dev;
	avl_index_t where;
	int error;

	*slice_cache = NULL;

	error = blkid_get_cache(&cache, NULL);
	if (error != 0)
		return (error);

	error = blkid_probe_all_new(cache);
	if (error != 0) {
		blkid_put_cache(cache);
		return (error);
	}

	iter = blkid_dev_iterate_begin(cache);
	if (iter == NULL) {
		blkid_put_cache(cache);
		return (EINVAL);
	}

	/* Only const char *s since 2.32 */
	error = blkid_dev_set_search(iter,
	    (char *)"TYPE", (char *)"zfs_member");
	if (error != 0) {
		blkid_dev_iterate_end(iter);
		blkid_put_cache(cache);
		return (error);
	}

	*slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t));
	avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
	    offsetof(rdsk_node_t, rn_node));

	while (blkid_dev_next(iter, &dev) == 0) {
		slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
		slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev));
		slice->rn_vdev_guid = 0;
		slice->rn_lock = lock;
		slice->rn_avl = *slice_cache;
		slice->rn_hdl = hdl;
		slice->rn_labelpaths = B_TRUE;

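		/*
		 * Rank the entry by its position in the device search path;
		 * names from preferred directories sort ahead of bare /dev
		 * nodes when duplicate entries are resolved.
		 */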
		error = zfs_path_order(slice->rn_name, &slice->rn_order);
		if (error == 0)
			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
		else
			slice->rn_order = IMPORT_ORDER_DEFAULT;

		pthread_mutex_lock(lock);
		if (avl_find(*slice_cache, slice, &where)) {
			free(slice->rn_name);
			free(slice);
		} else {
			avl_insert(*slice_cache, slice, where);
		}
		pthread_mutex_unlock(lock);
	}

	blkid_dev_iterate_end(iter);
	blkid_put_cache(cache);

	return (0);
}

/*
 * Linux persistent device strings for vdev labels
 *
 * based on libudev for consistency with libudev disk add/remove events
 */

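/*
 * Holds the persistent identity (devid) and physical location (phys_path)
 * strings encoded for a leaf vdev's label.
 */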
typedef struct vdev_dev_strs {
	char	vds_devid[128];
	char	vds_devphys[128];
} vdev_dev_strs_t;

#ifdef HAVE_LIBUDEV

/*
 * Obtain the persistent device id string (describes what)
 *
 * used by ZED vdev matching for auto-{online,expand,replace}
 */
int
zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
{
	struct udev_list_entry *entry;
	const char *bus;
	char devbyid[MAXPATHLEN];

	/* The bus based by-id path is preferred */
	bus = udev_device_get_property_value(dev, "ID_BUS");

	if (bus == NULL) {
		const char *dm_uuid;

		/*
		 * For multipath nodes use the persistent uuid based identifier
		 *
		 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
		 */
		dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
		if (dm_uuid != NULL) {
			(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
			return (0);
		}

		/*
		 * For volumes use the persistent /dev/zvol/dataset identifier
		 */
		entry = udev_device_get_devlinks_list_entry(dev);
		while (entry != NULL) {
			const char *name;

			name = udev_list_entry_get_name(entry);
			if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
				(void) strlcpy(bufptr, name, buflen);
				return (0);
			}
			entry = udev_list_entry_get_next(entry);
		}

		/*
		 * NVME 'by-id' symlinks are similar to bus case
		 */
		struct udev_device *parent;

		parent = udev_device_get_parent_with_subsystem_devtype(dev,
		    "nvme", NULL);
		if (parent != NULL)
			bus = "nvme";	/* continue with bus symlink search */
		else
			return (ENODATA);
	}

	/*
	 * locate the bus specific by-id link
	 */
	(void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		const char *name;

		name = udev_list_entry_get_name(entry);
		if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
			name += strlen(DEV_BYID_PATH);
			(void) strlcpy(bufptr, name, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	return (ENODATA);
}

/*
 * Obtain the persistent physical location string (describes where)
 *
 * used by ZED vdev matching for auto-{online,expand,replace}
 */
int
zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
{
	const char *physpath = NULL;
	struct udev_list_entry *entry;

	/*
	 * Normal disks use ID_PATH for their physical path.
	 */
	physpath = udev_device_get_property_value(dev, "ID_PATH");
	if (physpath != NULL && strlen(physpath) > 0) {
		(void) strlcpy(bufptr, physpath, buflen);
		return (0);
	}

	/*
	 * Device mapper devices are virtual and don't have a physical
	 * path. For them we use ID_VDEV instead, which is set up via the
	 * /etc/vdev_id.conf file.  ID_VDEV provides a persistent path
	 * to a virtual device.  If you don't have vdev_id.conf set up,
	 * you cannot use multipath autoreplace with device mapper.
	 */
	physpath = udev_device_get_property_value(dev, "ID_VDEV");
	if (physpath != NULL && strlen(physpath) > 0) {
		(void) strlcpy(bufptr, physpath, buflen);
		return (0);
	}

	/*
	 * For ZFS volumes use the persistent /dev/zvol/dataset identifier
	 */
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		physpath = udev_list_entry_get_name(entry);
		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
			(void) strlcpy(bufptr, physpath, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	/*
	 * For all other devices, fall back to using the by-uuid name.
	 */
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		physpath = udev_list_entry_get_name(entry);
		if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
			(void) strlcpy(bufptr, physpath, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	return (ENODATA);
}

/*
 * A disk is considered a multipath whole disk when:
 *	DEVNAME key value has "dm-"
 *	DM_NAME key value has "mpath" prefix
 *	DM_UUID key exists
 *	ID_PART_TABLE_TYPE key does not exist or is not gpt
 */
static boolean_t
udev_mpath_whole_disk(struct udev_device *dev)
{
	const char *devname, *type, *uuid;

	devname = udev_device_get_property_value(dev, "DEVNAME");
	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
	uuid = udev_device_get_property_value(dev, "DM_UUID");

	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
	    (uuid != NULL)) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

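/*
 * Return non-zero once udev has finished processing the device and its
 * devlinks have been populated.
 */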
static int
udev_device_is_ready(struct udev_device *dev)
{
#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
	return (udev_device_get_is_initialized(dev));
#else
	/* wait for DEVLINKS property to be initialized */
	return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
#endif
}

#else

int
zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
{
	(void) dev, (void) bufptr, (void) buflen;
	return (ENODATA);
}

int
zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
{
	(void) dev, (void) bufptr, (void) buflen;
	return (ENODATA);
}

#endif /* HAVE_LIBUDEV */

/*
 * Wait up to timeout_ms for udev to set up the device node.  The device is
 * considered ready when libudev determines it has been initialized, all of
 * the device links have been verified to exist, and it has been allowed to
 * settle.  At this point the device can be accessed reliably.  Depending on
 * the complexity of the udev rules this process could take several seconds.
 */
int
zpool_label_disk_wait(const char *path, int timeout_ms)
{
#ifdef HAVE_LIBUDEV
	struct udev *udev;
	struct udev_device *dev = NULL;
	char nodepath[MAXPATHLEN];
	char *sysname = NULL;
	int ret = ENODEV;
	int settle_ms = 50;
	long sleep_ms = 10;
	hrtime_t start, settle;

	if ((udev = udev_new()) == NULL)
		return (ENXIO);

	start = gethrtime();
	settle = 0;

	do {
		if (sysname == NULL) {
			if (realpath(path, nodepath) != NULL) {
				sysname = strrchr(nodepath, '/') + 1;
			} else {
				(void) usleep(sleep_ms * MILLISEC);
				continue;
			}
		}

		dev = udev_device_new_from_subsystem_sysname(udev,
		    "block", sysname);
		if ((dev != NULL) && udev_device_is_ready(dev)) {
			struct udev_list_entry *links, *link = NULL;

			ret = 0;
			links = udev_device_get_devlinks_list_entry(dev);

			udev_list_entry_foreach(link, links) {
				struct stat64 statbuf;
				const char *name;

				name = udev_list_entry_get_name(link);
				errno = 0;
				if (stat64(name, &statbuf) == 0 && errno == 0)
					continue;

				settle = 0;
				ret = ENODEV;
				break;
			}

			if (ret == 0) {
				if (settle == 0) {
					settle = gethrtime();
				} else if (NSEC2MSEC(gethrtime() - settle) >=
				    settle_ms) {
					udev_device_unref(dev);
					break;
				}
			}
		}

		udev_device_unref(dev);
		(void) usleep(sleep_ms * MILLISEC);

	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);

	udev_unref(udev);

	return (ret);
#else
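	/*
	 * Without libudev, poll for the device node with stat(2) and require
	 * it to remain present for settle_ms before declaring it ready.
	 */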
	int settle_ms = 50;
	long sleep_ms = 10;
	hrtime_t start, settle;
	struct stat64 statbuf;

	start = gethrtime();
	settle = 0;

	do {
		errno = 0;
		if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
			if (settle == 0)
				settle = gethrtime();
			else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
				return (0);
		} else if (errno != ENOENT) {
			return (errno);
		}

		usleep(sleep_ms * MILLISEC);
	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);

	return (ENODEV);
#endif /* HAVE_LIBUDEV */
}

/*
 * Encode the persistent device strings
 * used for the vdev disk label
 */
static int
encode_device_strings(const char *path, vdev_dev_strs_t *ds,
    boolean_t wholedisk)
{
#ifdef HAVE_LIBUDEV
	struct udev *udev;
	struct udev_device *dev = NULL;
	char nodepath[MAXPATHLEN];
	char *sysname;
	int ret = ENODEV;
	hrtime_t start;

	if ((udev = udev_new()) == NULL)
		return (ENXIO);

	/* resolve path to a runtime device node instance */
	if (realpath(path, nodepath) == NULL)
		goto no_dev;

	sysname = strrchr(nodepath, '/') + 1;

	/*
	 * Wait up to 3 seconds for udev to set up the device node context
	 */
	start = gethrtime();
	do {
		dev = udev_device_new_from_subsystem_sysname(udev, "block",
		    sysname);
		if (dev == NULL)
			goto no_dev;
		if (udev_device_is_ready(dev))
			break;  /* udev ready */

		udev_device_unref(dev);
		dev = NULL;

		if (NSEC2MSEC(gethrtime() - start) < 10)
			(void) sched_yield();	/* yield/busy wait up to 10ms */
		else
			(void) usleep(10 * MILLISEC);

	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));

	if (dev == NULL)
		goto no_dev;

	/*
	 * Only whole disks require extra device strings
	 */
	if (!wholedisk && !udev_mpath_whole_disk(dev))
		goto no_dev;

	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
	if (ret != 0)
		goto no_dev_ref;

	/* physical location string (optional) */
	if (zfs_device_get_physical(dev, ds->vds_devphys,
	    sizeof (ds->vds_devphys)) != 0) {
		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
	}

no_dev_ref:
	udev_device_unref(dev);
no_dev:
	udev_unref(udev);

	return (ret);
#else
	(void) path;
	(void) ds;
	(void) wholedisk;
	return (ENOENT);
#endif
}

/*
 * Rescan the enclosure sysfs path used for turning on enclosure LEDs and
 * store it in the nvlist (if applicable).  For example:
 *    vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
 *
 * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH)
 */
void
update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path,
    const char *key)
{
	char *upath, *spath;

	/* Add enclosure sysfs path (if disk is in an enclosure). */
	upath = zfs_get_underlying_path(path);
	spath = zfs_get_enclosure_sysfs_path(upath);

	if (spath) {
		(void) nvlist_add_string(nv, key, spath);
	} else {
		(void) nvlist_remove_all(nv, key);
	}

	free(upath);
	free(spath);
}

/*
 * This will get called for each leaf vdev.
 */
static int
sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data)
{
	(void) hdl_data, (void) data;

	const char *path = NULL;
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		return (1);

	/* Rescan our enclosure sysfs path for this vdev */
	update_vdev_config_dev_sysfs_path(nv, path,
	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	return (0);
}

/*
 * Given an nvlist for our pool (with vdev tree), iterate over all the
 * leaf vdevs and update their ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH.
 */
void
update_vdevs_config_dev_sysfs_path(nvlist_t *config)
{
	nvlist_t *nvroot = NULL;
	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	for_each_vdev_in_nvlist(nvroot, sysfs_path_pool_vdev_iter_f, NULL);
}

/*
 * Update a leaf vdev's persistent device strings
 *
 * - only applies for a dedicated leaf vdev (aka whole disk)
 * - updated during pool create|add|attach|import
 * - used for device matching during auto-{online,expand,replace}
 * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
 * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
 *
 * single device node example:
 * 	devid:		'scsi-MG03SCA300_350000494a8cb3d67-part1'
 * 	phys_path:	'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
 *
 * multipath device node example:
 * 	devid:		'dm-uuid-mpath-35000c5006304de3f'
 *
 * We also store the enclosure sysfs path for turning on enclosure LEDs
 * (if applicable):
 *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
 */
void
update_vdev_config_dev_strs(nvlist_t *nv)
{
	vdev_dev_strs_t vds;
	const char *env, *type, *path;
	uint64_t wholedisk = 0;

	/*
	 * For the benefit of legacy ZFS implementations, allow
	 * for opting out of devid strings in the vdev label.
	 *
	 * example use:
	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
	 *
	 * explanation:
	 * Older OpenZFS implementations had issues when attempting to
	 * display pool config VDEV names if a "devid" NVP value was
	 * present in the pool's config.
	 *
	 * For example, a pool that originated on the illumos platform would
	 * have a devid value in the config and "zpool status" would fail
	 * when listing the config.
	 *
	 * A pool can be stripped of any "devid" values on import or
	 * prevented from adding them on zpool create|add by setting
	 * ZFS_VDEV_DEVID_OPT_OUT.
	 */
	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
	if (env && (strtoul(env, NULL, 0) > 0 ||
	    !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
		return;
	}

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
	    strcmp(type, VDEV_TYPE_DISK) != 0) {
		return;
	}
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		return;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);

	/*
	 * Update device string values in the config nvlist.
	 */
	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
		(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
		if (vds.vds_devphys[0] != '\0') {
			(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
			    vds.vds_devphys);
		}
		update_vdev_config_dev_sysfs_path(nv, path,
		    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	} else {
		/* Clear out any stale entries. */
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	}
}