1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2016, 2017 Intel Corporation. 26 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 27 */ 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz[1|2]=(...) 39 * mirror=(...) 40 * 41 * Hot spares 42 * 43 * While the underlying implementation supports it, group vdevs cannot contain 44 * other group vdevs. All userland verification of devices is contained within 45 * this file. If successful, the nvlist returned can be passed directly to the 46 * kernel; we've done as much verification as possible in userland. 47 * 48 * Hot spares are a special case, and passed down as an array of disk vdevs, at 49 * the same level as the root of the vdev tree. 50 * 51 * The only function exported by this file is 'make_root_vdev'. The 52 * function performs several passes: 53 * 54 * 1. Construct the vdev specification. Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libblkid to make sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Call libzfs to label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <ctype.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libintl.h> 70 #include <libnvpair.h> 71 #include <libzutil.h> 72 #include <limits.h> 73 #include <sys/spa.h> 74 #include <stdio.h> 75 #include <string.h> 76 #include <unistd.h> 77 #include "zpool_util.h" 78 #include <sys/zfs_context.h> 79 80 #include <scsi/scsi.h> 81 #include <scsi/sg.h> 82 #include <sys/efi_partition.h> 83 #include <sys/stat.h> 84 #include <sys/vtoc.h> 85 #include <sys/mntent.h> 86 #include <uuid/uuid.h> 87 #include <blkid/blkid.h> 88 89 typedef struct vdev_disk_db_entry 90 { 91 char id[24]; 92 int sector_size; 93 } vdev_disk_db_entry_t; 94 95 /* 96 * Database of block devices that lie about physical sector sizes. The 97 * identification string must be precisely 24 characters to avoid false 98 * negatives 99 */ 100 static vdev_disk_db_entry_t vdev_disk_database[] = { 101 {"ATA ADATA SSD S396 3", 8192}, 102 {"ATA APPLE SSD SM128E", 8192}, 103 {"ATA APPLE SSD SM256E", 8192}, 104 {"ATA APPLE SSD SM512E", 8192}, 105 {"ATA APPLE SSD SM768E", 8192}, 106 {"ATA C400-MTFDDAC064M", 8192}, 107 {"ATA C400-MTFDDAC128M", 8192}, 108 {"ATA C400-MTFDDAC256M", 8192}, 109 {"ATA C400-MTFDDAC512M", 8192}, 110 {"ATA Corsair Force 3 ", 8192}, 111 {"ATA Corsair Force GS", 8192}, 112 {"ATA INTEL SSDSA2CT04", 8192}, 113 {"ATA INTEL SSDSA2BZ10", 8192}, 114 {"ATA INTEL SSDSA2BZ20", 8192}, 115 {"ATA INTEL SSDSA2BZ30", 8192}, 116 {"ATA INTEL SSDSA2CW04", 8192}, 117 {"ATA INTEL SSDSA2CW08", 8192}, 118 {"ATA INTEL SSDSA2CW12", 8192}, 119 {"ATA INTEL SSDSA2CW16", 8192}, 120 {"ATA INTEL SSDSA2CW30", 8192}, 121 {"ATA INTEL SSDSA2CW60", 8192}, 122 {"ATA INTEL SSDSC2CT06", 8192}, 123 {"ATA INTEL SSDSC2CT12", 8192}, 124 {"ATA INTEL SSDSC2CT18", 8192}, 125 {"ATA INTEL SSDSC2CT24", 8192}, 126 {"ATA INTEL SSDSC2CW06", 8192}, 127 {"ATA INTEL SSDSC2CW12", 8192}, 128 {"ATA INTEL SSDSC2CW18", 8192}, 129 {"ATA INTEL SSDSC2CW24", 8192}, 130 {"ATA INTEL SSDSC2CW48", 8192}, 131 {"ATA KINGSTON SH100S3", 8192}, 132 {"ATA KINGSTON SH103S3", 8192}, 133 {"ATA M4-CT064M4SSD2 ", 8192}, 134 {"ATA M4-CT128M4SSD2 ", 8192}, 135 {"ATA M4-CT256M4SSD2 ", 8192}, 136 {"ATA M4-CT512M4SSD2 ", 8192}, 137 {"ATA OCZ-AGILITY2 ", 8192}, 138 {"ATA OCZ-AGILITY3 ", 8192}, 139 {"ATA OCZ-VERTEX2 3.5 ", 8192}, 140 {"ATA OCZ-VERTEX3 ", 8192}, 141 {"ATA OCZ-VERTEX3 LT ", 8192}, 142 {"ATA OCZ-VERTEX3 MI ", 8192}, 143 {"ATA OCZ-VERTEX4 ", 8192}, 144 {"ATA SAMSUNG MZ7WD120", 8192}, 145 {"ATA SAMSUNG MZ7WD240", 8192}, 146 {"ATA SAMSUNG MZ7WD480", 8192}, 147 {"ATA SAMSUNG MZ7WD960", 8192}, 148 {"ATA SAMSUNG SSD 830 ", 8192}, 149 {"ATA Samsung SSD 840 ", 8192}, 150 {"ATA SanDisk SSD U100", 8192}, 151 {"ATA TOSHIBA THNSNH06", 8192}, 152 {"ATA TOSHIBA THNSNH12", 8192}, 153 {"ATA TOSHIBA THNSNH25", 8192}, 154 {"ATA TOSHIBA THNSNH51", 8192}, 155 {"ATA APPLE SSD TS064C", 4096}, 156 {"ATA APPLE SSD TS128C", 4096}, 157 {"ATA APPLE SSD TS256C", 4096}, 158 {"ATA APPLE SSD TS512C", 4096}, 159 {"ATA INTEL SSDSA2M040", 4096}, 160 {"ATA INTEL SSDSA2M080", 4096}, 161 {"ATA INTEL SSDSA2M160", 4096}, 162 {"ATA INTEL SSDSC2MH12", 4096}, 163 {"ATA INTEL SSDSC2MH25", 4096}, 164 {"ATA OCZ CORE_SSD ", 4096}, 165 {"ATA OCZ-VERTEX ", 4096}, 166 {"ATA SAMSUNG MCCOE32G", 4096}, 167 {"ATA SAMSUNG MCCOE64G", 4096}, 168 {"ATA SAMSUNG SSD PM80", 4096}, 169 /* Flash drives optimized for 4KB IOs on larger pages */ 170 {"ATA INTEL SSDSC2BA10", 4096}, 171 {"ATA INTEL SSDSC2BA20", 4096}, 172 {"ATA INTEL SSDSC2BA40", 4096}, 173 {"ATA INTEL SSDSC2BA80", 4096}, 174 {"ATA INTEL SSDSC2BB08", 4096}, 175 {"ATA INTEL SSDSC2BB12", 4096}, 176 {"ATA INTEL SSDSC2BB16", 4096}, 177 {"ATA INTEL SSDSC2BB24", 4096}, 178 {"ATA INTEL SSDSC2BB30", 4096}, 179 {"ATA INTEL SSDSC2BB40", 4096}, 180 {"ATA INTEL SSDSC2BB48", 4096}, 181 {"ATA INTEL SSDSC2BB60", 4096}, 182 {"ATA INTEL SSDSC2BB80", 4096}, 183 {"ATA INTEL SSDSC2BW24", 4096}, 184 {"ATA INTEL SSDSC2BW48", 4096}, 185 {"ATA INTEL SSDSC2BP24", 4096}, 186 {"ATA INTEL SSDSC2BP48", 4096}, 187 {"NA SmrtStorSDLKAE9W", 4096}, 188 {"NVMe Amazon EC2 NVMe ", 4096}, 189 /* Imported from Open Solaris */ 190 {"ATA MARVELL SD88SA02", 4096}, 191 /* Advanced format Hard drives */ 192 {"ATA Hitachi HDS5C303", 4096}, 193 {"ATA SAMSUNG HD204UI ", 4096}, 194 {"ATA ST2000DL004 HD20", 4096}, 195 {"ATA WDC WD10EARS-00M", 4096}, 196 {"ATA WDC WD10EARS-00S", 4096}, 197 {"ATA WDC WD10EARS-00Z", 4096}, 198 {"ATA WDC WD15EARS-00M", 4096}, 199 {"ATA WDC WD15EARS-00S", 4096}, 200 {"ATA WDC WD15EARS-00Z", 4096}, 201 {"ATA WDC WD20EARS-00M", 4096}, 202 {"ATA WDC WD20EARS-00S", 4096}, 203 {"ATA WDC WD20EARS-00Z", 4096}, 204 {"ATA WDC WD1600BEVT-0", 4096}, 205 {"ATA WDC WD2500BEVT-0", 4096}, 206 {"ATA WDC WD3200BEVT-0", 4096}, 207 {"ATA WDC WD5000BEVT-0", 4096}, 208 }; 209 210 211 #define INQ_REPLY_LEN 96 212 #define INQ_CMD_LEN 6 213 214 static const int vdev_disk_database_size = 215 sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]); 216 217 boolean_t 218 check_sector_size_database(char *path, int *sector_size) 219 { 220 unsigned char inq_buff[INQ_REPLY_LEN]; 221 unsigned char sense_buffer[32]; 222 unsigned char inq_cmd_blk[INQ_CMD_LEN] = 223 {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0}; 224 sg_io_hdr_t io_hdr; 225 int error; 226 int fd; 227 int i; 228 229 /* Prepare INQUIRY command */ 230 memset(&io_hdr, 0, sizeof (sg_io_hdr_t)); 231 io_hdr.interface_id = 'S'; 232 io_hdr.cmd_len = sizeof (inq_cmd_blk); 233 io_hdr.mx_sb_len = sizeof (sense_buffer); 234 io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; 235 io_hdr.dxfer_len = INQ_REPLY_LEN; 236 io_hdr.dxferp = inq_buff; 237 io_hdr.cmdp = inq_cmd_blk; 238 io_hdr.sbp = sense_buffer; 239 io_hdr.timeout = 10; /* 10 milliseconds is ample time */ 240 241 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) 242 return (B_FALSE); 243 244 error = ioctl(fd, SG_IO, (unsigned long) &io_hdr); 245 246 (void) close(fd); 247 248 if (error < 0) 249 return (B_FALSE); 250 251 if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK) 252 return (B_FALSE); 253 254 for (i = 0; i < vdev_disk_database_size; i++) { 255 if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24)) 256 continue; 257 258 *sector_size = vdev_disk_database[i].sector_size; 259 return (B_TRUE); 260 } 261 262 return (B_FALSE); 263 } 264 265 static int 266 check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare) 267 { 268 int err; 269 char *value; 270 271 /* No valid type detected device is safe to use */ 272 value = blkid_get_tag_value(cache, "TYPE", path); 273 if (value == NULL) 274 return (0); 275 276 /* 277 * If libblkid detects a ZFS device, we check the device 278 * using check_file() to see if it's safe. The one safe 279 * case is a spare device shared between multiple pools. 280 */ 281 if (strcmp(value, "zfs_member") == 0) { 282 err = check_file(path, force, isspare); 283 } else { 284 if (force) { 285 err = 0; 286 } else { 287 err = -1; 288 vdev_error(gettext("%s contains a filesystem of " 289 "type '%s'\n"), path, value); 290 } 291 } 292 293 free(value); 294 295 return (err); 296 } 297 298 /* 299 * Validate that a disk including all partitions are safe to use. 300 * 301 * For EFI labeled disks this can done relatively easily with the libefi 302 * library. The partition numbers are extracted from the label and used 303 * to generate the expected /dev/ paths. Each partition can then be 304 * checked for conflicts. 305 * 306 * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible 307 * but due to the lack of a readily available libraries this scanning is 308 * not implemented. Instead only the device path as given is checked. 309 */ 310 static int 311 check_disk(const char *path, blkid_cache cache, int force, 312 boolean_t isspare, boolean_t iswholedisk) 313 { 314 struct dk_gpt *vtoc; 315 char slice_path[MAXPATHLEN]; 316 int err = 0; 317 int fd, i; 318 int flags = O_RDONLY|O_DIRECT; 319 320 if (!iswholedisk) 321 return (check_slice(path, cache, force, isspare)); 322 323 /* only spares can be shared, other devices require exclusive access */ 324 if (!isspare) 325 flags |= O_EXCL; 326 327 if ((fd = open(path, flags)) < 0) { 328 char *value = blkid_get_tag_value(cache, "TYPE", path); 329 (void) fprintf(stderr, gettext("%s is in use and contains " 330 "a %s filesystem.\n"), path, value ? value : "unknown"); 331 free(value); 332 return (-1); 333 } 334 335 /* 336 * Expected to fail for non-EFI labeled disks. Just check the device 337 * as given and do not attempt to detect and scan partitions. 338 */ 339 err = efi_alloc_and_read(fd, &vtoc); 340 if (err) { 341 (void) close(fd); 342 return (check_slice(path, cache, force, isspare)); 343 } 344 345 /* 346 * The primary efi partition label is damaged however the secondary 347 * label at the end of the device is intact. Rather than use this 348 * label we should play it safe and treat this as a non efi device. 349 */ 350 if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { 351 efi_free(vtoc); 352 (void) close(fd); 353 354 if (force) { 355 /* Partitions will now be created using the backup */ 356 return (0); 357 } else { 358 vdev_error(gettext("%s contains a corrupt primary " 359 "EFI label.\n"), path); 360 return (-1); 361 } 362 } 363 364 for (i = 0; i < vtoc->efi_nparts; i++) { 365 366 if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED || 367 uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) 368 continue; 369 370 if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) 371 (void) snprintf(slice_path, sizeof (slice_path), 372 "%s%s%d", path, "-part", i+1); 373 else 374 (void) snprintf(slice_path, sizeof (slice_path), 375 "%s%s%d", path, isdigit(path[strlen(path)-1]) ? 376 "p" : "", i+1); 377 378 err = check_slice(slice_path, cache, force, isspare); 379 if (err) 380 break; 381 } 382 383 efi_free(vtoc); 384 (void) close(fd); 385 386 return (err); 387 } 388 389 int 390 check_device(const char *path, boolean_t force, 391 boolean_t isspare, boolean_t iswholedisk) 392 { 393 blkid_cache cache; 394 int error; 395 396 error = blkid_get_cache(&cache, NULL); 397 if (error != 0) { 398 (void) fprintf(stderr, gettext("unable to access the blkid " 399 "cache.\n")); 400 return (-1); 401 } 402 403 error = check_disk(path, cache, force, isspare, iswholedisk); 404 blkid_put_cache(cache); 405 406 return (error); 407 } 408 409 void 410 after_zpool_upgrade(zpool_handle_t *zhp) 411 { 412 (void) zhp; 413 } 414 415 int 416 check_file(const char *file, boolean_t force, boolean_t isspare) 417 { 418 return (check_file_generic(file, force, isspare)); 419 } 420