1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <ctype.h> 27 #include <dirent.h> 28 #include <fcntl.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <sys/efi_partition.h> 33 34 #ifdef HAVE_LIBUDEV 35 #include <libudev.h> 36 #endif 37 38 #include <libzutil.h> 39 40 /* 41 * Append partition suffix to an otherwise fully qualified device path. 42 * This is used to generate the name the full path as its stored in 43 * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length 44 * of 'path' will be returned on error a negative value is returned. 45 */ 46 int 47 zfs_append_partition(char *path, size_t max_len) 48 { 49 int len = strlen(path); 50 51 if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || 52 (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { 53 if (len + 6 >= max_len) 54 return (-1); 55 56 (void) strcat(path, "-part1"); 57 len += 6; 58 } else { 59 if (len + 2 >= max_len) 60 return (-1); 61 62 if (isdigit(path[len-1])) { 63 (void) strcat(path, "p1"); 64 len += 2; 65 } else { 66 (void) strcat(path, "1"); 67 len += 1; 68 } 69 } 70 71 return (len); 72 } 73 74 /* 75 * Remove partition suffix from a vdev path. Partition suffixes may take three 76 * forms: "-partX", "pX", or "X", where X is a string of digits. The second 77 * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The 78 * third case only occurs when preceded by a string matching the regular 79 * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. 80 * 81 * caller must free the returned string 82 */ 83 char * 84 zfs_strip_partition(const char *path) 85 { 86 char *tmp = strdup(path); 87 char *part = NULL, *d = NULL; 88 if (!tmp) 89 return (NULL); 90 91 if ((part = strstr(tmp, "-part")) && part != tmp) { 92 d = part + 5; 93 } else if ((part = strrchr(tmp, 'p')) && 94 part > tmp + 1 && isdigit(*(part-1))) { 95 d = part + 1; 96 } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && 97 tmp[1] == 'd') { 98 for (d = &tmp[2]; isalpha(*d); part = ++d) { } 99 } else if (strncmp("xvd", tmp, 3) == 0) { 100 for (d = &tmp[3]; isalpha(*d); part = ++d) { } 101 } 102 if (part && d && *d != '\0') { 103 for (; isdigit(*d); d++) { } 104 if (*d == '\0') 105 *part = '\0'; 106 } 107 108 return (tmp); 109 } 110 111 /* 112 * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname 113 * 114 * path: /dev/sda1 115 * returns: /dev/sda 116 * 117 * Returned string must be freed. 118 */ 119 static char * 120 zfs_strip_partition_path(const char *path) 121 { 122 char *newpath = strdup(path); 123 char *sd_offset; 124 char *new_sd; 125 126 if (!newpath) 127 return (NULL); 128 129 /* Point to "sda1" part of "/dev/sda1" */ 130 sd_offset = strrchr(newpath, '/') + 1; 131 132 /* Get our new name "sda" */ 133 new_sd = zfs_strip_partition(sd_offset); 134 if (!new_sd) { 135 free(newpath); 136 return (NULL); 137 } 138 139 /* Paste the "sda" where "sda1" was */ 140 strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); 141 142 /* Free temporary "sda" */ 143 free(new_sd); 144 145 return (newpath); 146 } 147 148 /* 149 * Strip the unwanted portion of a device path. 150 */ 151 const char * 152 zfs_strip_path(const char *path) 153 { 154 size_t spath_count; 155 const char *const *spaths = zpool_default_search_paths(&spath_count); 156 157 for (size_t i = 0; i < spath_count; ++i) 158 if (strncmp(path, spaths[i], strlen(spaths[i])) == 0 && 159 path[strlen(spaths[i])] == '/') 160 return (path + strlen(spaths[i]) + 1); 161 162 return (path); 163 } 164 165 /* 166 * Read the contents of a sysfs file into an allocated buffer and remove the 167 * last newline. 168 * 169 * This is useful for reading sysfs files that return a single string. Return 170 * an allocated string pointer on success, NULL otherwise. Returned buffer 171 * must be freed by the user. 172 */ 173 static char * 174 zfs_read_sysfs_file(char *filepath) 175 { 176 char buf[4096]; /* all sysfs files report 4k size */ 177 char *str = NULL; 178 179 FILE *fp = fopen(filepath, "r"); 180 if (fp == NULL) { 181 return (NULL); 182 } 183 if (fgets(buf, sizeof (buf), fp) == buf) { 184 /* success */ 185 186 /* Remove the last newline (if any) */ 187 size_t len = strlen(buf); 188 if (buf[len - 1] == '\n') { 189 buf[len - 1] = '\0'; 190 } 191 str = strdup(buf); 192 } 193 194 fclose(fp); 195 196 return (str); 197 } 198 199 /* 200 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to 201 * the drive (in /sys/bus/pci/slots). 202 * 203 * For example: 204 * dev: "nvme0n1" 205 * returns: "/sys/bus/pci/slots/0" 206 * 207 * 'dev' must be an NVMe device. 208 * 209 * Returned string must be freed. Returns NULL on error or no sysfs path. 210 */ 211 static char * 212 zfs_get_pci_slots_sys_path(const char *dev_name) 213 { 214 DIR *dp = NULL; 215 struct dirent *ep; 216 char *address1 = NULL; 217 char *address2 = NULL; 218 char *path = NULL; 219 char buf[MAXPATHLEN]; 220 char *tmp; 221 222 /* If they preface 'dev' with a path (like "/dev") then strip it off */ 223 tmp = strrchr(dev_name, '/'); 224 if (tmp != NULL) 225 dev_name = tmp + 1; /* +1 since we want the chr after '/' */ 226 227 if (strncmp("nvme", dev_name, 4) != 0) 228 return (NULL); 229 230 (void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address", 231 dev_name); 232 233 address1 = zfs_read_sysfs_file(buf); 234 if (!address1) 235 return (NULL); 236 237 /* 238 * /sys/block/nvme0n1/device/address format will 239 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be 240 * "0000:01:00". Just NULL terminate at the '.' so they match. 241 */ 242 tmp = strrchr(address1, '.'); 243 if (tmp != NULL) 244 *tmp = '\0'; 245 246 dp = opendir("/sys/bus/pci/slots/"); 247 if (dp == NULL) { 248 free(address1); 249 return (NULL); 250 } 251 252 /* 253 * Look through all the /sys/bus/pci/slots/ subdirs 254 */ 255 while ((ep = readdir(dp))) { 256 /* 257 * We only care about directory names that are a single number. 258 * Sometimes there's other directories like 259 * "/sys/bus/pci/slots/0-3/" in there - skip those. 260 */ 261 if (!zfs_isnumber(ep->d_name)) 262 continue; 263 264 (void) snprintf(buf, sizeof (buf), 265 "/sys/bus/pci/slots/%s/address", ep->d_name); 266 267 address2 = zfs_read_sysfs_file(buf); 268 if (!address2) 269 continue; 270 271 if (strcmp(address1, address2) == 0) { 272 /* Addresses match, we're all done */ 273 free(address2); 274 if (asprintf(&path, "/sys/bus/pci/slots/%s", 275 ep->d_name) == -1) { 276 free(tmp); 277 continue; 278 } 279 break; 280 } 281 free(address2); 282 } 283 284 closedir(dp); 285 free(address1); 286 287 return (path); 288 } 289 290 /* 291 * Given a dev name like "sda", return the full enclosure sysfs path to 292 * the disk. You can also pass in the name with "/dev" prepended 293 * to it (like /dev/sda). This works for both JBODs and NVMe PCI devices. 294 * 295 * For example, disk "sda" in enclosure slot 1: 296 * dev_name: "sda" 297 * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" 298 * 299 * Or: 300 * 301 * dev_name: "nvme0n1" 302 * returns: "/sys/bus/pci/slots/0" 303 * 304 * 'dev' must be a non-devicemapper device. 305 * 306 * Returned string must be freed. Returns NULL on error. 307 */ 308 char * 309 zfs_get_enclosure_sysfs_path(const char *dev_name) 310 { 311 DIR *dp = NULL; 312 struct dirent *ep; 313 char buf[MAXPATHLEN]; 314 char *tmp1 = NULL; 315 char *tmp2 = NULL; 316 char *tmp3 = NULL; 317 char *path = NULL; 318 size_t size; 319 int tmpsize; 320 321 if (dev_name == NULL) 322 return (NULL); 323 324 /* If they preface 'dev' with a path (like "/dev") then strip it off */ 325 tmp1 = strrchr(dev_name, '/'); 326 if (tmp1 != NULL) 327 dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ 328 329 tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); 330 if (tmpsize == -1 || tmp1 == NULL) { 331 tmp1 = NULL; 332 goto end; 333 } 334 335 dp = opendir(tmp1); 336 if (dp == NULL) 337 goto end; 338 339 /* 340 * Look though all sysfs entries in /sys/block/<dev>/device for 341 * the enclosure symlink. 342 */ 343 while ((ep = readdir(dp))) { 344 /* Ignore everything that's not our enclosure_device link */ 345 if (strstr(ep->d_name, "enclosure_device") == NULL) 346 continue; 347 348 if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1) { 349 tmp2 = NULL; 350 break; 351 } 352 353 size = readlink(tmp2, buf, sizeof (buf)); 354 355 /* Did readlink fail or crop the link name? */ 356 if (size == -1 || size >= sizeof (buf)) 357 break; 358 359 /* 360 * We got a valid link. readlink() doesn't terminate strings 361 * so we have to do it. 362 */ 363 buf[size] = '\0'; 364 365 /* 366 * Our link will look like: 367 * 368 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1" 369 * 370 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part 371 */ 372 tmp3 = strstr(buf, "enclosure"); 373 if (tmp3 == NULL) 374 break; 375 376 if (asprintf(&path, "/sys/class/%s", tmp3) == -1) { 377 /* If asprintf() fails, 'path' is undefined */ 378 path = NULL; 379 break; 380 } 381 382 if (path == NULL) 383 break; 384 } 385 386 end: 387 free(tmp2); 388 free(tmp1); 389 390 if (dp != NULL) 391 closedir(dp); 392 393 if (!path) { 394 /* 395 * This particular disk isn't in a JBOD. It could be an NVMe 396 * drive. If so, look up the NVMe device's path in 397 * /sys/bus/pci/slots/. Within that directory is a 'attention' 398 * file which controls the NVMe fault LED. 399 */ 400 path = zfs_get_pci_slots_sys_path(dev_name); 401 } 402 403 return (path); 404 } 405 406 /* 407 * Allocate and return the underlying device name for a device mapper device. 408 * 409 * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a 410 * DM device (like /dev/disk/by-vdev/A0) are also allowed. 411 * 412 * If the DM device has multiple underlying devices (like with multipath 413 * DM devices), then favor underlying devices that have a symlink back to their 414 * back to their enclosure device in sysfs. This will be useful for the 415 * zedlet scripts that toggle the fault LED. 416 * 417 * Returns an underlying device name, or NULL on error or no match. If dm_name 418 * is not a DM device then return NULL. 419 * 420 * NOTE: The returned name string must be *freed*. 421 */ 422 static char * 423 dm_get_underlying_path(const char *dm_name) 424 { 425 DIR *dp = NULL; 426 struct dirent *ep; 427 char *realp; 428 char *tmp = NULL; 429 char *path = NULL; 430 char *dev_str; 431 int size; 432 char *first_path = NULL; 433 char *enclosure_path; 434 435 if (dm_name == NULL) 436 return (NULL); 437 438 /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ 439 realp = realpath(dm_name, NULL); 440 if (realp == NULL) 441 return (NULL); 442 443 /* 444 * If they preface 'dev' with a path (like "/dev") then strip it off. 445 * We just want the 'dm-N' part. 446 */ 447 tmp = strrchr(realp, '/'); 448 if (tmp != NULL) 449 dev_str = tmp + 1; /* +1 since we want the chr after '/' */ 450 else 451 dev_str = tmp; 452 453 if ((size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str)) == -1) { 454 tmp = NULL; 455 goto end; 456 } 457 458 dp = opendir(tmp); 459 if (dp == NULL) 460 goto end; 461 462 /* 463 * A device-mapper device can have multiple paths to it (multipath). 464 * Favor paths that have a symlink back to their enclosure device. 465 * We have to do this since some enclosures may only provide a symlink 466 * back for one underlying path to a disk and not the other. 467 * 468 * If no paths have links back to their enclosure, then just return the 469 * first path. 470 */ 471 while ((ep = readdir(dp))) { 472 if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ 473 if (!first_path) 474 first_path = strdup(ep->d_name); 475 476 enclosure_path = 477 zfs_get_enclosure_sysfs_path(ep->d_name); 478 479 if (!enclosure_path) 480 continue; 481 482 if ((size = asprintf( 483 &path, "/dev/%s", ep->d_name)) == -1) 484 path = NULL; 485 free(enclosure_path); 486 break; 487 } 488 } 489 490 end: 491 if (dp != NULL) 492 closedir(dp); 493 free(tmp); 494 free(realp); 495 496 if (!path && first_path) { 497 /* 498 * None of the underlying paths had a link back to their 499 * enclosure devices. Throw up out hands and return the first 500 * underlying path. 501 */ 502 if ((size = asprintf(&path, "/dev/%s", first_path)) == -1) 503 path = NULL; 504 } 505 506 free(first_path); 507 return (path); 508 } 509 510 /* 511 * Return B_TRUE if device is a device mapper or multipath device. 512 * Return B_FALSE if not. 513 */ 514 boolean_t 515 zfs_dev_is_dm(const char *dev_name) 516 { 517 518 char *tmp; 519 tmp = dm_get_underlying_path(dev_name); 520 if (tmp == NULL) 521 return (B_FALSE); 522 523 free(tmp); 524 return (B_TRUE); 525 } 526 527 /* 528 * By "whole disk" we mean an entire physical disk (something we can 529 * label, toggle the write cache on, etc.) as opposed to the full 530 * capacity of a pseudo-device such as lofi or did. We act as if we 531 * are labeling the disk, which should be a pretty good test of whether 532 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 533 * it isn't. 534 */ 535 boolean_t 536 zfs_dev_is_whole_disk(const char *dev_name) 537 { 538 struct dk_gpt *label = NULL; 539 int fd; 540 541 if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0) 542 return (B_FALSE); 543 544 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 545 (void) close(fd); 546 return (B_FALSE); 547 } 548 549 efi_free(label); 550 (void) close(fd); 551 552 return (B_TRUE); 553 } 554 555 /* 556 * Lookup the underlying device for a device name 557 * 558 * Often you'll have a symlink to a device, a partition device, 559 * or a multipath device, and want to look up the underlying device. 560 * This function returns the underlying device name. If the device 561 * name is already the underlying device, then just return the same 562 * name. If the device is a DM device with multiple underlying devices 563 * then return the first one. 564 * 565 * For example: 566 * 567 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda 568 * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 569 * returns: /dev/sda 570 * 571 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) 572 * dev_name: /dev/mapper/mpatha 573 * returns: /dev/sda (first device) 574 * 575 * 3. /dev/sda (already the underlying device) 576 * dev_name: /dev/sda 577 * returns: /dev/sda 578 * 579 * 4. /dev/dm-3 (mapped to /dev/sda) 580 * dev_name: /dev/dm-3 581 * returns: /dev/sda 582 * 583 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 584 * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 585 * returns: /dev/sdb 586 * 587 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 588 * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a 589 * returns: /dev/sda 590 * 591 * Returns underlying device name, or NULL on error or no match. 592 * 593 * NOTE: The returned name string must be *freed*. 594 */ 595 char * 596 zfs_get_underlying_path(const char *dev_name) 597 { 598 char *name = NULL; 599 char *tmp; 600 601 if (dev_name == NULL) 602 return (NULL); 603 604 tmp = dm_get_underlying_path(dev_name); 605 606 /* dev_name not a DM device, so just un-symlinkize it */ 607 if (tmp == NULL) 608 tmp = realpath(dev_name, NULL); 609 610 if (tmp != NULL) { 611 name = zfs_strip_partition_path(tmp); 612 free(tmp); 613 } 614 615 return (name); 616 } 617 618 619 #ifdef HAVE_LIBUDEV 620 621 /* 622 * A disk is considered a multipath whole disk when: 623 * DEVNAME key value has "dm-" 624 * DM_UUID key exists and starts with 'mpath-' 625 * ID_PART_TABLE_TYPE key does not exist or is not gpt 626 * ID_FS_LABEL key does not exist (disk isn't labeled) 627 */ 628 static boolean_t 629 is_mpath_udev_sane(struct udev_device *dev) 630 { 631 const char *devname, *type, *uuid, *label; 632 633 devname = udev_device_get_property_value(dev, "DEVNAME"); 634 type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); 635 uuid = udev_device_get_property_value(dev, "DM_UUID"); 636 label = udev_device_get_property_value(dev, "ID_FS_LABEL"); 637 638 if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && 639 ((type == NULL) || (strcmp(type, "gpt") != 0)) && 640 ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) && 641 (label == NULL)) { 642 return (B_TRUE); 643 } 644 645 return (B_FALSE); 646 } 647 648 /* 649 * Check if a disk is a multipath "blank" disk: 650 * 651 * 1. The disk has udev values that suggest it's a multipath disk 652 * 2. The disk is not currently labeled with a filesystem of any type 653 * 3. There are no partitions on the disk 654 */ 655 boolean_t 656 is_mpath_whole_disk(const char *path) 657 { 658 struct udev *udev; 659 struct udev_device *dev = NULL; 660 char nodepath[MAXPATHLEN]; 661 char *sysname; 662 663 if (realpath(path, nodepath) == NULL) 664 return (B_FALSE); 665 sysname = strrchr(nodepath, '/') + 1; 666 if (strncmp(sysname, "dm-", 3) != 0) 667 return (B_FALSE); 668 if ((udev = udev_new()) == NULL) 669 return (B_FALSE); 670 if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", 671 sysname)) == NULL) { 672 udev_device_unref(dev); 673 return (B_FALSE); 674 } 675 676 /* Sanity check some udev values */ 677 boolean_t is_sane = is_mpath_udev_sane(dev); 678 udev_device_unref(dev); 679 680 return (is_sane); 681 } 682 683 #else /* HAVE_LIBUDEV */ 684 685 boolean_t 686 is_mpath_whole_disk(const char *path) 687 { 688 (void) path; 689 return (B_FALSE); 690 } 691 692 #endif /* HAVE_LIBUDEV */ 693