1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <ctype.h>
27 #include <dirent.h>
28 #include <fcntl.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <sys/efi_partition.h>
33 
34 #ifdef HAVE_LIBUDEV
35 #include <libudev.h>
36 #endif
37 
38 #include <libzutil.h>
39 
40 /*
41  * Append partition suffix to an otherwise fully qualified device path.
42  * This is used to generate the name the full path as its stored in
43  * ZPOOL_CONFIG_PATH for whole disk devices.  On success the new length
44  * of 'path' will be returned on error a negative value is returned.
45  */
46 int
47 zfs_append_partition(char *path, size_t max_len)
48 {
49 	int len = strlen(path);
50 
51 	if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
52 	    (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
53 		if (len + 6 >= max_len)
54 			return (-1);
55 
56 		(void) strcat(path, "-part1");
57 		len += 6;
58 	} else {
59 		if (len + 2 >= max_len)
60 			return (-1);
61 
62 		if (isdigit(path[len-1])) {
63 			(void) strcat(path, "p1");
64 			len += 2;
65 		} else {
66 			(void) strcat(path, "1");
67 			len += 1;
68 		}
69 	}
70 
71 	return (len);
72 }
73 
/*
 * Remove the partition suffix from a vdev name.  Partition suffixes may take
 * three forms: "-partX", "pX", or "X", where X is a string of digits.
 *
 *	"-partX"  recognized anywhere except at the very start of the name.
 *	"pX"      only when the 'p' is preceded by a digit, i.e. "md0p0".
 *	"X"       only when preceded by a string matching the regular
 *	          expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio
 *	          or xen disk.
 *
 * The suffix is removed only if the digits run to the end of the name;
 * otherwise an unmodified copy is returned.
 *
 * Returns a newly allocated string which the caller must free, or NULL if
 * 'path' is NULL or the allocation fails.
 */
char *
zfs_strip_partition(char *path)
{
	char *tmp;
	char *part = NULL;	/* where the name gets cut */
	char *d = NULL;		/* first character after the suffix prefix */

	if (path == NULL)
		return (NULL);

	tmp = strdup(path);
	if (tmp == NULL)
		return (NULL);

	/*
	 * All <ctype.h> arguments are cast to unsigned char: passing a plain
	 * char holding a byte >= 0x80 is undefined behavior.
	 */
	if ((part = strstr(tmp, "-part")) && part != tmp) {
		d = part + 5;
	} else if ((part = strrchr(tmp, 'p')) &&
	    part > tmp + 1 && isdigit((unsigned char)*(part - 1))) {
		d = part + 1;
	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
	    tmp[1] == 'd') {
		/* Skip the drive letters; 'part' trails one step behind. */
		for (d = &tmp[2]; isalpha((unsigned char)*d); part = ++d) { }
	} else if (strncmp("xvd", tmp, 3) == 0) {
		for (d = &tmp[3]; isalpha((unsigned char)*d); part = ++d) { }
	}

	/* Cut at 'part' only when everything from 'd' onwards is digits. */
	if (part && d && *d != '\0') {
		for (; isdigit((unsigned char)*d); d++) { }
		if (*d == '\0')
			*part = '\0';
	}

	return (tmp);
}
110 
/*
 * Same as zfs_strip_partition(), but allows a directory prefix such as
 * "/dev/" to be in the pathname.  Only the component after the last '/' is
 * stripped; a name without any '/' is stripped as a whole.
 *
 * path:	/dev/sda1
 * returns:	/dev/sda
 *
 * Returns a newly allocated string which must be freed, or NULL if an
 * allocation fails.
 */
static char *
zfs_strip_partition_path(char *path)
{
	char *newpath = strdup(path);
	char *slash;
	char *sd_offset;
	char *new_sd;

	if (newpath == NULL)
		return (NULL);

	/*
	 * Point to the "sda1" part of "/dev/sda1".  When there is no '/' the
	 * whole string is the device name; computing strrchr() + 1 in that
	 * case would be arithmetic on a NULL pointer.
	 */
	slash = strrchr(newpath, '/');
	sd_offset = (slash != NULL) ? slash + 1 : newpath;

	/* Get our new name "sda" */
	new_sd = zfs_strip_partition(sd_offset);
	if (new_sd == NULL) {
		free(newpath);
		return (NULL);
	}

	/*
	 * Paste the "sda" where "sda1" was.  The destination size passed is
	 * that of the unstripped name including its terminator.
	 */
	strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);

	/* Free temporary "sda" */
	free(new_sd);

	return (newpath);
}
147 
/*
 * Strip the unwanted (directory) portion of a device path.
 *
 * Returns a pointer into 'path' just past its last '/'.  If 'path' contains
 * no '/' it is returned unchanged.  Nothing is allocated; the result must not
 * be freed.
 */
char *
zfs_strip_path(char *path)
{
	char *slash = strrchr(path, '/');

	/* strrchr() + 1 on a miss would be arithmetic on a NULL pointer */
	return (slash != NULL ? slash + 1 : path);
}
156 
/*
 * Read the first line of a sysfs file into an allocated buffer, without its
 * trailing newline.
 *
 * This is useful for reading sysfs files that return a single string.
 * Returns an allocated string on success.  Returns NULL if the file cannot
 * be opened, nothing can be read from it, or the allocation fails.  The
 * returned buffer must be freed by the caller.
 */
static char *
zfs_read_sysfs_file(char *filepath)
{
	char buf[4096];	/* all sysfs files report 4k size */
	char *str = NULL;
	FILE *fp;

	fp = fopen(filepath, "r");
	if (fp == NULL)
		return (NULL);

	if (fgets(buf, sizeof (buf), fp) != NULL) {
		/*
		 * fgets() stops right after the first newline, so cutting at
		 * the first '\n' removes the trailing one (if any).  Unlike
		 * indexing buf[strlen(buf) - 1], this stays in bounds when
		 * the data read starts with a NUL byte.
		 */
		buf[strcspn(buf, "\n")] = '\0';
		str = strdup(buf);
	}

	(void) fclose(fp);

	return (str);
}
190 
/*
 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
 * the drive (in /sys/bus/pci/slots).
 *
 * For example:
 *     dev_name:       "nvme0n1"
 *     returns:        "/sys/bus/pci/slots/0"
 *
 * 'dev_name' may carry a directory prefix (like "/dev/"), which is ignored.
 * The remaining name must start with "nvme".
 *
 * Returned string must be freed.  Returns NULL on error, if 'dev_name' is
 * not an NVMe name, or if no slot with a matching address exists.
 */
static char *
zfs_get_pci_slots_sys_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	const char *slash;
	char *dot;
	char *address1 = NULL;
	char *address2 = NULL;
	char *path = NULL;
	char buf[MAXPATHLEN];
	int match;

	if (dev_name == NULL)
		return (NULL);

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	slash = strrchr(dev_name, '/');
	if (slash != NULL)
		dev_name = slash + 1;    /* +1 since we want the chr after '/' */

	if (strncmp("nvme", dev_name, 4) != 0)
		return (NULL);

	(void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address",
	    dev_name);

	address1 = zfs_read_sysfs_file(buf);
	if (address1 == NULL)
		return (NULL);

	/*
	 * /sys/block/nvme0n1/device/address format will
	 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
	 * "0000:01:00".  Just NUL terminate at the '.' so they match.
	 */
	dot = strrchr(address1, '.');
	if (dot != NULL)
		*dot = '\0';

	dp = opendir("/sys/bus/pci/slots/");
	if (dp == NULL) {
		free(address1);
		return (NULL);
	}

	/*
	 * Look through all the /sys/bus/pci/slots/ subdirs
	 */
	while ((ep = readdir(dp))) {
		/*
		 * We only care about directory names that are a single number.
		 * Sometimes there's other directories like
		 * "/sys/bus/pci/slots/0-3/" in there - skip those.
		 */
		if (!zfs_isnumber(ep->d_name))
			continue;

		(void) snprintf(buf, sizeof (buf),
		    "/sys/bus/pci/slots/%s/address", ep->d_name);

		address2 = zfs_read_sysfs_file(buf);
		if (address2 == NULL)
			continue;

		match = (strcmp(address1, address2) == 0);
		free(address2);
		if (!match)
			continue;

		/* Addresses match, we're all done */
		if (asprintf(&path, "/sys/bus/pci/slots/%s",
		    ep->d_name) == -1) {
			/*
			 * 'path' is undefined after a failed asprintf().
			 * Nothing else needs releasing here: 'address1' is
			 * freed once, after the loop.
			 */
			path = NULL;
			continue;
		}
		break;
	}

	closedir(dp);
	free(address1);

	return (path);
}
281 
/*
 * Given a dev name like "sda", return the full enclosure sysfs path to
 * the disk.  You can also pass in the name with "/dev" prepended
 * to it (like /dev/sda).  This works for both JBODs and NVMe PCI devices.
 *
 * For example, disk "sda" in enclosure slot 1:
 *     dev_name:       "sda"
 *     returns:        "/sys/class/enclosure/1:0:3:0/Slot 1"
 *
 * Or:
 *
 *      dev_name:   "nvme0n1"
 *      returns:    "/sys/bus/pci/slots/0"
 *
 * 'dev_name' must be a non-devicemapper device.
 *
 * Returned string must be freed.  Returns NULL on error, or when neither an
 * enclosure link nor a PCI slot is found for the device.
 */
char *
zfs_get_enclosure_sysfs_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char buf[MAXPATHLEN];
	const char *slash;
	char *dev_dir = NULL;	/* "/sys/block/<dev>/device" */
	char *link_path = NULL;	/* enclosure_device symlink inside dev_dir */
	char *encl;
	char *path = NULL;
	ssize_t size;

	if (dev_name == NULL)
		return (NULL);

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	slash = strrchr(dev_name, '/');
	if (slash != NULL)
		dev_name = slash + 1;    /* +1 since we want the chr after '/' */

	if (asprintf(&dev_dir, "/sys/block/%s/device", dev_name) == -1) {
		/* If asprintf() fails, 'dev_dir' is undefined */
		dev_dir = NULL;
		goto end;
	}

	dp = opendir(dev_dir);
	if (dp == NULL)
		goto end;

	/*
	 * Look through all sysfs entries in /sys/block/<dev>/device for
	 * the enclosure symlink.  The scan stops at the first matching
	 * entry, whether or not it resolves; continuing would overwrite
	 * (and leak) the strings allocated for it.
	 */
	while ((ep = readdir(dp))) {
		/* Ignore everything that's not our enclosure_device link */
		if (strstr(ep->d_name, "enclosure_device") == NULL)
			continue;

		if (asprintf(&link_path, "%s/%s", dev_dir,
		    ep->d_name) == -1) {
			link_path = NULL;
			break;
		}

		size = readlink(link_path, buf, sizeof (buf));

		/* Did readlink fail or crop the link name? */
		if (size == -1 || (size_t)size >= sizeof (buf))
			break;

		/*
		 * We got a valid link.  readlink() doesn't terminate strings
		 * so we have to do it.
		 */
		buf[size] = '\0';

		/*
		 * Our link will look like:
		 *
		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
		 *
		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
		 */
		encl = strstr(buf, "enclosure");
		if (encl == NULL)
			break;

		if (asprintf(&path, "/sys/class/%s", encl) == -1) {
			/* If asprintf() fails, 'path' is undefined */
			path = NULL;
		}
		break;
	}

end:
	free(link_path);
	free(dev_dir);

	if (dp != NULL)
		closedir(dp);

	if (path == NULL) {
		/*
		 * This particular disk isn't in a JBOD.  It could be an NVMe
		 * drive. If so, look up the NVMe device's path in
		 * /sys/bus/pci/slots/.
		 */
		path = zfs_get_pci_slots_sys_path(dev_name);
	}

	return (path);
}
397 
398 /*
399  * Allocate and return the underlying device name for a device mapper device.
400  *
401  * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
402  * DM device (like /dev/disk/by-vdev/A0) are also allowed.
403  *
404  * If the DM device has multiple underlying devices (like with multipath
405  * DM devices), then favor underlying devices that have a symlink back to their
406  * back to their enclosure device in sysfs.  This will be useful for the
407  * zedlet scripts that toggle the fault LED.
408  *
409  * Returns an underlying device name, or NULL on error or no match.  If dm_name
410  * is not a DM device then return NULL.
411  *
412  * NOTE: The returned name string must be *freed*.
413  */
414 static char *
415 dm_get_underlying_path(const char *dm_name)
416 {
417 	DIR *dp = NULL;
418 	struct dirent *ep;
419 	char *realp;
420 	char *tmp = NULL;
421 	char *path = NULL;
422 	char *dev_str;
423 	int size;
424 	char *first_path = NULL;
425 	char *enclosure_path;
426 
427 	if (dm_name == NULL)
428 		return (NULL);
429 
430 	/* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
431 	realp = realpath(dm_name, NULL);
432 	if (realp == NULL)
433 		return (NULL);
434 
435 	/*
436 	 * If they preface 'dev' with a path (like "/dev") then strip it off.
437 	 * We just want the 'dm-N' part.
438 	 */
439 	tmp = strrchr(realp, '/');
440 	if (tmp != NULL)
441 		dev_str = tmp + 1;    /* +1 since we want the chr after '/' */
442 	else
443 		dev_str = tmp;
444 
445 	if ((size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str)) == -1) {
446 		tmp = NULL;
447 		goto end;
448 	}
449 
450 	dp = opendir(tmp);
451 	if (dp == NULL)
452 		goto end;
453 
454 	/*
455 	 * A device-mapper device can have multiple paths to it (multipath).
456 	 * Favor paths that have a symlink back to their enclosure device.
457 	 * We have to do this since some enclosures may only provide a symlink
458 	 * back for one underlying path to a disk and not the other.
459 	 *
460 	 * If no paths have links back to their enclosure, then just return the
461 	 * first path.
462 	 */
463 	while ((ep = readdir(dp))) {
464 		if (ep->d_type != DT_DIR) {	/* skip "." and ".." dirs */
465 			if (!first_path)
466 				first_path = strdup(ep->d_name);
467 
468 			enclosure_path =
469 			    zfs_get_enclosure_sysfs_path(ep->d_name);
470 
471 			if (!enclosure_path)
472 				continue;
473 
474 			if ((size = asprintf(
475 			    &path, "/dev/%s", ep->d_name)) == -1)
476 				path = NULL;
477 			free(enclosure_path);
478 			break;
479 		}
480 	}
481 
482 end:
483 	if (dp != NULL)
484 		closedir(dp);
485 	free(tmp);
486 	free(realp);
487 
488 	if (!path && first_path) {
489 		/*
490 		 * None of the underlying paths had a link back to their
491 		 * enclosure devices.  Throw up out hands and return the first
492 		 * underlying path.
493 		 */
494 		if ((size = asprintf(&path, "/dev/%s", first_path)) == -1)
495 			path = NULL;
496 	}
497 
498 	free(first_path);
499 	return (path);
500 }
501 
502 /*
503  * Return B_TRUE if device is a device mapper or multipath device.
504  * Return B_FALSE if not.
505  */
506 boolean_t
507 zfs_dev_is_dm(const char *dev_name)
508 {
509 
510 	char *tmp;
511 	tmp = dm_get_underlying_path(dev_name);
512 	if (tmp == NULL)
513 		return (B_FALSE);
514 
515 	free(tmp);
516 	return (B_TRUE);
517 }
518 
519 /*
520  * By "whole disk" we mean an entire physical disk (something we can
521  * label, toggle the write cache on, etc.) as opposed to the full
522  * capacity of a pseudo-device such as lofi or did.  We act as if we
523  * are labeling the disk, which should be a pretty good test of whether
524  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
525  * it isn't.
526  */
527 boolean_t
528 zfs_dev_is_whole_disk(const char *dev_name)
529 {
530 	struct dk_gpt *label = NULL;
531 	int fd;
532 
533 	if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
534 		return (B_FALSE);
535 
536 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
537 		(void) close(fd);
538 		return (B_FALSE);
539 	}
540 
541 	efi_free(label);
542 	(void) close(fd);
543 
544 	return (B_TRUE);
545 }
546 
/*
 * Look up the underlying device for a device name.
 *
 * A device name is often a symlink to a device, a partition device, or a
 * multipath device, while the caller wants the device underneath it.  This
 * function returns that underlying device name.  If the name already is the
 * underlying device, the same name is returned.  For a DM device the choice
 * among several underlying devices is made by dm_get_underlying_path().
 *
 * For example:
 *
 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
 * dev_name:	/dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
 * returns:	/dev/sda
 *
 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
 * dev_name:	/dev/mapper/mpatha
 * returns:	/dev/sda (first device)
 *
 * 3. /dev/sda (already the underlying device)
 * dev_name:	/dev/sda
 * returns:	/dev/sda
 *
 * 4. /dev/dm-3 (mapped to /dev/sda)
 * dev_name:	/dev/dm-3
 * returns:	/dev/sda
 *
 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
 * dev_name:	/dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
 * returns:	/dev/sdb
 *
 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../../sda2
 * dev_name:	/dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
 * returns:	/dev/sda
 *
 * Returns the underlying device name, or NULL on error or no match.
 *
 * NOTE: The returned name string must be *freed*.
 */
char *
zfs_get_underlying_path(const char *dev_name)
{
	char *resolved;
	char *stripped;

	if (dev_name == NULL)
		return (NULL);

	/* Try the DM lookup first; otherwise just un-symlinkize the name. */
	resolved = dm_get_underlying_path(dev_name);
	if (resolved == NULL)
		resolved = realpath(dev_name, NULL);
	if (resolved == NULL)
		return (NULL);

	/* Drop any partition suffix from the resolved device node. */
	stripped = zfs_strip_partition_path(resolved);
	free(resolved);

	return (stripped);
}
609 
610 
611 #ifdef HAVE_LIBUDEV
612 
613 /*
614  * A disk is considered a multipath whole disk when:
615  *	DEVNAME key value has "dm-"
616  *	DM_UUID key exists and starts with 'mpath-'
617  *	ID_PART_TABLE_TYPE key does not exist or is not gpt
618  *	ID_FS_LABEL key does not exist (disk isn't labeled)
619  */
620 static boolean_t
621 is_mpath_udev_sane(struct udev_device *dev)
622 {
623 	const char *devname, *type, *uuid, *label;
624 
625 	devname = udev_device_get_property_value(dev, "DEVNAME");
626 	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
627 	uuid = udev_device_get_property_value(dev, "DM_UUID");
628 	label = udev_device_get_property_value(dev, "ID_FS_LABEL");
629 
630 	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
631 	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
632 	    ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) &&
633 	    (label == NULL)) {
634 		return (B_TRUE);
635 	}
636 
637 	return (B_FALSE);
638 }
639 
640 /*
641  * Check if a disk is a multipath "blank" disk:
642  *
643  * 1. The disk has udev values that suggest it's a multipath disk
644  * 2. The disk is not currently labeled with a filesystem of any type
645  * 3. There are no partitions on the disk
646  */
647 boolean_t
648 is_mpath_whole_disk(const char *path)
649 {
650 	struct udev *udev;
651 	struct udev_device *dev = NULL;
652 	char nodepath[MAXPATHLEN];
653 	char *sysname;
654 
655 	if (realpath(path, nodepath) == NULL)
656 		return (B_FALSE);
657 	sysname = strrchr(nodepath, '/') + 1;
658 	if (strncmp(sysname, "dm-", 3) != 0)
659 		return (B_FALSE);
660 	if ((udev = udev_new()) == NULL)
661 		return (B_FALSE);
662 	if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
663 	    sysname)) == NULL) {
664 		udev_device_unref(dev);
665 		return (B_FALSE);
666 	}
667 
668 	/* Sanity check some udev values */
669 	boolean_t is_sane = is_mpath_udev_sane(dev);
670 	udev_device_unref(dev);
671 
672 	return (is_sane);
673 }
674 
675 #else /* HAVE_LIBUDEV */
676 
677 boolean_t
678 is_mpath_whole_disk(const char *path)
679 {
680 	(void) path;
681 	return (B_FALSE);
682 }
683 
684 #endif /* HAVE_LIBUDEV */
685