1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <ctype.h> 27 #include <dirent.h> 28 #include <fcntl.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <sys/efi_partition.h> 33 34 #ifdef HAVE_LIBUDEV 35 #include <libudev.h> 36 #endif 37 38 #include <libzutil.h> 39 40 /* 41 * Append partition suffix to an otherwise fully qualified device path. 42 * This is used to generate the name the full path as its stored in 43 * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length 44 * of 'path' will be returned on error a negative value is returned. 45 */ 46 int 47 zfs_append_partition(char *path, size_t max_len) 48 { 49 int len = strlen(path); 50 51 if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || 52 (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { 53 if (len + 6 >= max_len) 54 return (-1); 55 56 (void) strcat(path, "-part1"); 57 len += 6; 58 } else { 59 if (len + 2 >= max_len) 60 return (-1); 61 62 if (isdigit(path[len-1])) { 63 (void) strcat(path, "p1"); 64 len += 2; 65 } else { 66 (void) strcat(path, "1"); 67 len += 1; 68 } 69 } 70 71 return (len); 72 } 73 74 /* 75 * Remove partition suffix from a vdev path. Partition suffixes may take three 76 * forms: "-partX", "pX", or "X", where X is a string of digits. The second 77 * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The 78 * third case only occurs when preceded by a string matching the regular 79 * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. 80 * 81 * caller must free the returned string 82 */ 83 char * 84 zfs_strip_partition(char *path) 85 { 86 char *tmp = strdup(path); 87 char *part = NULL, *d = NULL; 88 if (!tmp) 89 return (NULL); 90 91 if ((part = strstr(tmp, "-part")) && part != tmp) { 92 d = part + 5; 93 } else if ((part = strrchr(tmp, 'p')) && 94 part > tmp + 1 && isdigit(*(part-1))) { 95 d = part + 1; 96 } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && 97 tmp[1] == 'd') { 98 for (d = &tmp[2]; isalpha(*d); part = ++d) { } 99 } else if (strncmp("xvd", tmp, 3) == 0) { 100 for (d = &tmp[3]; isalpha(*d); part = ++d) { } 101 } 102 if (part && d && *d != '\0') { 103 for (; isdigit(*d); d++) { } 104 if (*d == '\0') 105 *part = '\0'; 106 } 107 108 return (tmp); 109 } 110 111 /* 112 * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname 113 * 114 * path: /dev/sda1 115 * returns: /dev/sda 116 * 117 * Returned string must be freed. 118 */ 119 static char * 120 zfs_strip_partition_path(char *path) 121 { 122 char *newpath = strdup(path); 123 char *sd_offset; 124 char *new_sd; 125 126 if (!newpath) 127 return (NULL); 128 129 /* Point to "sda1" part of "/dev/sda1" */ 130 sd_offset = strrchr(newpath, '/') + 1; 131 132 /* Get our new name "sda" */ 133 new_sd = zfs_strip_partition(sd_offset); 134 if (!new_sd) { 135 free(newpath); 136 return (NULL); 137 } 138 139 /* Paste the "sda" where "sda1" was */ 140 strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); 141 142 /* Free temporary "sda" */ 143 free(new_sd); 144 145 return (newpath); 146 } 147 148 /* 149 * Strip the unwanted portion of a device path. 150 */ 151 char * 152 zfs_strip_path(char *path) 153 { 154 return (strrchr(path, '/') + 1); 155 } 156 157 /* 158 * Read the contents of a sysfs file into an allocated buffer and remove the 159 * last newline. 160 * 161 * This is useful for reading sysfs files that return a single string. Return 162 * an allocated string pointer on success, NULL otherwise. Returned buffer 163 * must be freed by the user. 164 */ 165 static char * 166 zfs_read_sysfs_file(char *filepath) 167 { 168 char buf[4096]; /* all sysfs files report 4k size */ 169 char *str = NULL; 170 171 FILE *fp = fopen(filepath, "r"); 172 if (fp == NULL) { 173 return (NULL); 174 } 175 if (fgets(buf, sizeof (buf), fp) == buf) { 176 /* success */ 177 178 /* Remove the last newline (if any) */ 179 size_t len = strlen(buf); 180 if (buf[len - 1] == '\n') { 181 buf[len - 1] = '\0'; 182 } 183 str = strdup(buf); 184 } 185 186 fclose(fp); 187 188 return (str); 189 } 190 191 /* 192 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to 193 * the drive (in /sys/bus/pci/slots). 194 * 195 * For example: 196 * dev: "nvme0n1" 197 * returns: "/sys/bus/pci/slots/0" 198 * 199 * 'dev' must be an NVMe device. 200 * 201 * Returned string must be freed. Returns NULL on error or no sysfs path. 202 */ 203 static char * 204 zfs_get_pci_slots_sys_path(const char *dev_name) 205 { 206 DIR *dp = NULL; 207 struct dirent *ep; 208 char *address1 = NULL; 209 char *address2 = NULL; 210 char *path = NULL; 211 char buf[MAXPATHLEN]; 212 char *tmp; 213 214 /* If they preface 'dev' with a path (like "/dev") then strip it off */ 215 tmp = strrchr(dev_name, '/'); 216 if (tmp != NULL) 217 dev_name = tmp + 1; /* +1 since we want the chr after '/' */ 218 219 if (strncmp("nvme", dev_name, 4) != 0) 220 return (NULL); 221 222 (void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address", 223 dev_name); 224 225 address1 = zfs_read_sysfs_file(buf); 226 if (!address1) 227 return (NULL); 228 229 /* 230 * /sys/block/nvme0n1/device/address format will 231 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be 232 * "0000:01:00". Just NULL terminate at the '.' so they match. 233 */ 234 tmp = strrchr(address1, '.'); 235 if (tmp != NULL) 236 *tmp = '\0'; 237 238 dp = opendir("/sys/bus/pci/slots/"); 239 if (dp == NULL) { 240 free(address1); 241 return (NULL); 242 } 243 244 /* 245 * Look through all the /sys/bus/pci/slots/ subdirs 246 */ 247 while ((ep = readdir(dp))) { 248 /* 249 * We only care about directory names that are a single number. 250 * Sometimes there's other directories like 251 * "/sys/bus/pci/slots/0-3/" in there - skip those. 252 */ 253 if (!zfs_isnumber(ep->d_name)) 254 continue; 255 256 (void) snprintf(buf, sizeof (buf), 257 "/sys/bus/pci/slots/%s/address", ep->d_name); 258 259 address2 = zfs_read_sysfs_file(buf); 260 if (!address2) 261 continue; 262 263 if (strcmp(address1, address2) == 0) { 264 /* Addresses match, we're all done */ 265 free(address2); 266 if (asprintf(&path, "/sys/bus/pci/slots/%s", 267 ep->d_name) == -1) { 268 free(tmp); 269 continue; 270 } 271 break; 272 } 273 free(address2); 274 } 275 276 closedir(dp); 277 free(address1); 278 279 return (path); 280 } 281 282 /* 283 * Given a dev name like "sda", return the full enclosure sysfs path to 284 * the disk. You can also pass in the name with "/dev" prepended 285 * to it (like /dev/sda). This works for both JBODs and NVMe PCI devices. 286 * 287 * For example, disk "sda" in enclosure slot 1: 288 * dev_name: "sda" 289 * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" 290 * 291 * Or: 292 * 293 * dev_name: "nvme0n1" 294 * returns: "/sys/bus/pci/slots/0" 295 * 296 * 'dev' must be a non-devicemapper device. 297 * 298 * Returned string must be freed. Returns NULL on error. 299 */ 300 char * 301 zfs_get_enclosure_sysfs_path(const char *dev_name) 302 { 303 DIR *dp = NULL; 304 struct dirent *ep; 305 char buf[MAXPATHLEN]; 306 char *tmp1 = NULL; 307 char *tmp2 = NULL; 308 char *tmp3 = NULL; 309 char *path = NULL; 310 size_t size; 311 int tmpsize; 312 313 if (dev_name == NULL) 314 return (NULL); 315 316 /* If they preface 'dev' with a path (like "/dev") then strip it off */ 317 tmp1 = strrchr(dev_name, '/'); 318 if (tmp1 != NULL) 319 dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ 320 321 tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); 322 if (tmpsize == -1 || tmp1 == NULL) { 323 tmp1 = NULL; 324 goto end; 325 } 326 327 dp = opendir(tmp1); 328 if (dp == NULL) 329 goto end; 330 331 /* 332 * Look though all sysfs entries in /sys/block/<dev>/device for 333 * the enclosure symlink. 334 */ 335 while ((ep = readdir(dp))) { 336 /* Ignore everything that's not our enclosure_device link */ 337 if (strstr(ep->d_name, "enclosure_device") == NULL) 338 continue; 339 340 if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1) { 341 tmp2 = NULL; 342 break; 343 } 344 345 size = readlink(tmp2, buf, sizeof (buf)); 346 347 /* Did readlink fail or crop the link name? */ 348 if (size == -1 || size >= sizeof (buf)) 349 break; 350 351 /* 352 * We got a valid link. readlink() doesn't terminate strings 353 * so we have to do it. 354 */ 355 buf[size] = '\0'; 356 357 /* 358 * Our link will look like: 359 * 360 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1" 361 * 362 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part 363 */ 364 tmp3 = strstr(buf, "enclosure"); 365 if (tmp3 == NULL) 366 break; 367 368 if (asprintf(&path, "/sys/class/%s", tmp3) == -1) { 369 /* If asprintf() fails, 'path' is undefined */ 370 path = NULL; 371 break; 372 } 373 374 if (path == NULL) 375 break; 376 } 377 378 end: 379 free(tmp2); 380 free(tmp1); 381 382 if (dp != NULL) 383 closedir(dp); 384 385 if (!path) { 386 /* 387 * This particular disk isn't in a JBOD. It could be an NVMe 388 * drive. If so, look up the NVMe device's path in 389 * /sys/bus/pci/slots/. Within that directory is a 'attention' 390 * file which controls the NVMe fault LED. 391 */ 392 path = zfs_get_pci_slots_sys_path(dev_name); 393 } 394 395 return (path); 396 } 397 398 /* 399 * Allocate and return the underlying device name for a device mapper device. 400 * 401 * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a 402 * DM device (like /dev/disk/by-vdev/A0) are also allowed. 403 * 404 * If the DM device has multiple underlying devices (like with multipath 405 * DM devices), then favor underlying devices that have a symlink back to their 406 * back to their enclosure device in sysfs. This will be useful for the 407 * zedlet scripts that toggle the fault LED. 408 * 409 * Returns an underlying device name, or NULL on error or no match. If dm_name 410 * is not a DM device then return NULL. 411 * 412 * NOTE: The returned name string must be *freed*. 413 */ 414 static char * 415 dm_get_underlying_path(const char *dm_name) 416 { 417 DIR *dp = NULL; 418 struct dirent *ep; 419 char *realp; 420 char *tmp = NULL; 421 char *path = NULL; 422 char *dev_str; 423 int size; 424 char *first_path = NULL; 425 char *enclosure_path; 426 427 if (dm_name == NULL) 428 return (NULL); 429 430 /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ 431 realp = realpath(dm_name, NULL); 432 if (realp == NULL) 433 return (NULL); 434 435 /* 436 * If they preface 'dev' with a path (like "/dev") then strip it off. 437 * We just want the 'dm-N' part. 438 */ 439 tmp = strrchr(realp, '/'); 440 if (tmp != NULL) 441 dev_str = tmp + 1; /* +1 since we want the chr after '/' */ 442 else 443 dev_str = tmp; 444 445 if ((size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str)) == -1) { 446 tmp = NULL; 447 goto end; 448 } 449 450 dp = opendir(tmp); 451 if (dp == NULL) 452 goto end; 453 454 /* 455 * A device-mapper device can have multiple paths to it (multipath). 456 * Favor paths that have a symlink back to their enclosure device. 457 * We have to do this since some enclosures may only provide a symlink 458 * back for one underlying path to a disk and not the other. 459 * 460 * If no paths have links back to their enclosure, then just return the 461 * first path. 462 */ 463 while ((ep = readdir(dp))) { 464 if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ 465 if (!first_path) 466 first_path = strdup(ep->d_name); 467 468 enclosure_path = 469 zfs_get_enclosure_sysfs_path(ep->d_name); 470 471 if (!enclosure_path) 472 continue; 473 474 if ((size = asprintf( 475 &path, "/dev/%s", ep->d_name)) == -1) 476 path = NULL; 477 free(enclosure_path); 478 break; 479 } 480 } 481 482 end: 483 if (dp != NULL) 484 closedir(dp); 485 free(tmp); 486 free(realp); 487 488 if (!path && first_path) { 489 /* 490 * None of the underlying paths had a link back to their 491 * enclosure devices. Throw up out hands and return the first 492 * underlying path. 493 */ 494 if ((size = asprintf(&path, "/dev/%s", first_path)) == -1) 495 path = NULL; 496 } 497 498 free(first_path); 499 return (path); 500 } 501 502 /* 503 * Return B_TRUE if device is a device mapper or multipath device. 504 * Return B_FALSE if not. 505 */ 506 boolean_t 507 zfs_dev_is_dm(const char *dev_name) 508 { 509 510 char *tmp; 511 tmp = dm_get_underlying_path(dev_name); 512 if (tmp == NULL) 513 return (B_FALSE); 514 515 free(tmp); 516 return (B_TRUE); 517 } 518 519 /* 520 * By "whole disk" we mean an entire physical disk (something we can 521 * label, toggle the write cache on, etc.) as opposed to the full 522 * capacity of a pseudo-device such as lofi or did. We act as if we 523 * are labeling the disk, which should be a pretty good test of whether 524 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 525 * it isn't. 526 */ 527 boolean_t 528 zfs_dev_is_whole_disk(const char *dev_name) 529 { 530 struct dk_gpt *label = NULL; 531 int fd; 532 533 if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0) 534 return (B_FALSE); 535 536 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 537 (void) close(fd); 538 return (B_FALSE); 539 } 540 541 efi_free(label); 542 (void) close(fd); 543 544 return (B_TRUE); 545 } 546 547 /* 548 * Lookup the underlying device for a device name 549 * 550 * Often you'll have a symlink to a device, a partition device, 551 * or a multipath device, and want to look up the underlying device. 552 * This function returns the underlying device name. If the device 553 * name is already the underlying device, then just return the same 554 * name. If the device is a DM device with multiple underlying devices 555 * then return the first one. 556 * 557 * For example: 558 * 559 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda 560 * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 561 * returns: /dev/sda 562 * 563 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) 564 * dev_name: /dev/mapper/mpatha 565 * returns: /dev/sda (first device) 566 * 567 * 3. /dev/sda (already the underlying device) 568 * dev_name: /dev/sda 569 * returns: /dev/sda 570 * 571 * 4. /dev/dm-3 (mapped to /dev/sda) 572 * dev_name: /dev/dm-3 573 * returns: /dev/sda 574 * 575 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 576 * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 577 * returns: /dev/sdb 578 * 579 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 580 * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a 581 * returns: /dev/sda 582 * 583 * Returns underlying device name, or NULL on error or no match. 584 * 585 * NOTE: The returned name string must be *freed*. 586 */ 587 char * 588 zfs_get_underlying_path(const char *dev_name) 589 { 590 char *name = NULL; 591 char *tmp; 592 593 if (dev_name == NULL) 594 return (NULL); 595 596 tmp = dm_get_underlying_path(dev_name); 597 598 /* dev_name not a DM device, so just un-symlinkize it */ 599 if (tmp == NULL) 600 tmp = realpath(dev_name, NULL); 601 602 if (tmp != NULL) { 603 name = zfs_strip_partition_path(tmp); 604 free(tmp); 605 } 606 607 return (name); 608 } 609 610 611 #ifdef HAVE_LIBUDEV 612 613 /* 614 * A disk is considered a multipath whole disk when: 615 * DEVNAME key value has "dm-" 616 * DM_UUID key exists and starts with 'mpath-' 617 * ID_PART_TABLE_TYPE key does not exist or is not gpt 618 * ID_FS_LABEL key does not exist (disk isn't labeled) 619 */ 620 static boolean_t 621 is_mpath_udev_sane(struct udev_device *dev) 622 { 623 const char *devname, *type, *uuid, *label; 624 625 devname = udev_device_get_property_value(dev, "DEVNAME"); 626 type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); 627 uuid = udev_device_get_property_value(dev, "DM_UUID"); 628 label = udev_device_get_property_value(dev, "ID_FS_LABEL"); 629 630 if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && 631 ((type == NULL) || (strcmp(type, "gpt") != 0)) && 632 ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) && 633 (label == NULL)) { 634 return (B_TRUE); 635 } 636 637 return (B_FALSE); 638 } 639 640 /* 641 * Check if a disk is a multipath "blank" disk: 642 * 643 * 1. The disk has udev values that suggest it's a multipath disk 644 * 2. The disk is not currently labeled with a filesystem of any type 645 * 3. There are no partitions on the disk 646 */ 647 boolean_t 648 is_mpath_whole_disk(const char *path) 649 { 650 struct udev *udev; 651 struct udev_device *dev = NULL; 652 char nodepath[MAXPATHLEN]; 653 char *sysname; 654 655 if (realpath(path, nodepath) == NULL) 656 return (B_FALSE); 657 sysname = strrchr(nodepath, '/') + 1; 658 if (strncmp(sysname, "dm-", 3) != 0) 659 return (B_FALSE); 660 if ((udev = udev_new()) == NULL) 661 return (B_FALSE); 662 if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", 663 sysname)) == NULL) { 664 udev_device_unref(dev); 665 return (B_FALSE); 666 } 667 668 /* Sanity check some udev values */ 669 boolean_t is_sane = is_mpath_udev_sane(dev); 670 udev_device_unref(dev); 671 672 return (is_sane); 673 } 674 675 #else /* HAVE_LIBUDEV */ 676 677 boolean_t 678 is_mpath_whole_disk(const char *path) 679 { 680 (void) path; 681 return (B_FALSE); 682 } 683 684 #endif /* HAVE_LIBUDEV */ 685