1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2016, 2017 Intel Corporation.
26  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27  */
28 
29 /*
30  * Functions to convert between a list of vdevs and an nvlist representing the
31  * configuration.  Each entry in the list can be one of:
32  *
33  * 	Device vdevs
34  * 		disk=(path=..., devid=...)
35  * 		file=(path=...)
36  *
37  * 	Group vdevs
38  * 		raidz[1|2]=(...)
39  * 		mirror=(...)
40  *
41  * 	Hot spares
42  *
43  * While the underlying implementation supports it, group vdevs cannot contain
44  * other group vdevs.  All userland verification of devices is contained within
45  * this file.  If successful, the nvlist returned can be passed directly to the
46  * kernel; we've done as much verification as possible in userland.
47  *
48  * Hot spares are a special case, and passed down as an array of disk vdevs, at
49  * the same level as the root of the vdev tree.
50  *
51  * The only function exported by this file is 'make_root_vdev'.  The
52  * function performs several passes:
53  *
54  * 	1. Construct the vdev specification.  Performs syntax validation and
55  *         makes sure each device is valid.
 * 	2. Check for devices in use.  Uses libblkid to make sure that none of
 *         the devices are already in use.  Some conflicts can be overridden
 *         using the 'force' flag, others cannot.
 * 	3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
62  * 	4. Call libzfs to label any whole disks with an EFI label.
63  */
64 
65 #include <assert.h>
66 #include <ctype.h>
67 #include <errno.h>
68 #include <fcntl.h>
69 #include <libintl.h>
70 #include <libnvpair.h>
71 #include <libzutil.h>
72 #include <limits.h>
73 #include <sys/spa.h>
74 #include <stdio.h>
75 #include <string.h>
76 #include <unistd.h>
77 #include "zpool_util.h"
78 #include <sys/zfs_context.h>
79 
80 #include <scsi/scsi.h>
81 #include <scsi/sg.h>
82 #include <sys/efi_partition.h>
83 #include <sys/stat.h>
84 #include <sys/mntent.h>
85 #include <uuid/uuid.h>
86 #include <blkid/blkid.h>
87 
/*
 * One entry of the sector size override database.
 *
 * id           24 identification bytes, not NUL terminated; compared
 *              byte-for-byte against bytes 8-31 of the INQUIRY reply.
 * sector_size  sector size, in bytes, to report for a matching device.
 */
typedef struct vdev_disk_db_entry
{
	char id[24];
	int sector_size;
} vdev_disk_db_entry_t;

/*
 * Database of block devices that lie about physical sector sizes.
 *
 * Every identification string must be precisely 24 characters, padded
 * with spaces where needed.  Lookups compare all 24 bytes with memcmp(),
 * so a shorter string would be padded with NUL bytes by the compiler and
 * could never match a device (a false negative).
 *
 * The table is read-only and therefore declared const.
 */
static const vdev_disk_db_entry_t vdev_disk_database[] = {
	{"ATA     ADATA SSD S396 3", 8192},
	{"ATA     APPLE SSD SM128E", 8192},
	{"ATA     APPLE SSD SM256E", 8192},
	{"ATA     APPLE SSD SM512E", 8192},
	{"ATA     APPLE SSD SM768E", 8192},
	{"ATA     C400-MTFDDAC064M", 8192},
	{"ATA     C400-MTFDDAC128M", 8192},
	{"ATA     C400-MTFDDAC256M", 8192},
	{"ATA     C400-MTFDDAC512M", 8192},
	{"ATA     Corsair Force 3 ", 8192},
	{"ATA     Corsair Force GS", 8192},
	{"ATA     INTEL SSDSA2CT04", 8192},
	{"ATA     INTEL SSDSA2BZ10", 8192},
	{"ATA     INTEL SSDSA2BZ20", 8192},
	{"ATA     INTEL SSDSA2BZ30", 8192},
	{"ATA     INTEL SSDSA2CW04", 8192},
	{"ATA     INTEL SSDSA2CW08", 8192},
	{"ATA     INTEL SSDSA2CW12", 8192},
	{"ATA     INTEL SSDSA2CW16", 8192},
	{"ATA     INTEL SSDSA2CW30", 8192},
	{"ATA     INTEL SSDSA2CW60", 8192},
	{"ATA     INTEL SSDSC2CT06", 8192},
	{"ATA     INTEL SSDSC2CT12", 8192},
	{"ATA     INTEL SSDSC2CT18", 8192},
	{"ATA     INTEL SSDSC2CT24", 8192},
	{"ATA     INTEL SSDSC2CW06", 8192},
	{"ATA     INTEL SSDSC2CW12", 8192},
	{"ATA     INTEL SSDSC2CW18", 8192},
	{"ATA     INTEL SSDSC2CW24", 8192},
	{"ATA     INTEL SSDSC2CW48", 8192},
	{"ATA     KINGSTON SH100S3", 8192},
	{"ATA     KINGSTON SH103S3", 8192},
	{"ATA     M4-CT064M4SSD2  ", 8192},
	{"ATA     M4-CT128M4SSD2  ", 8192},
	{"ATA     M4-CT256M4SSD2  ", 8192},
	{"ATA     M4-CT512M4SSD2  ", 8192},
	{"ATA     OCZ-AGILITY2    ", 8192},
	{"ATA     OCZ-AGILITY3    ", 8192},
	{"ATA     OCZ-VERTEX2 3.5 ", 8192},
	{"ATA     OCZ-VERTEX3     ", 8192},
	{"ATA     OCZ-VERTEX3 LT  ", 8192},
	{"ATA     OCZ-VERTEX3 MI  ", 8192},
	{"ATA     OCZ-VERTEX4     ", 8192},
	{"ATA     SAMSUNG MZ7WD120", 8192},
	{"ATA     SAMSUNG MZ7WD240", 8192},
	{"ATA     SAMSUNG MZ7WD480", 8192},
	{"ATA     SAMSUNG MZ7WD960", 8192},
	{"ATA     SAMSUNG SSD 830 ", 8192},
	{"ATA     Samsung SSD 840 ", 8192},
	{"ATA     SanDisk SSD U100", 8192},
	{"ATA     TOSHIBA THNSNH06", 8192},
	{"ATA     TOSHIBA THNSNH12", 8192},
	{"ATA     TOSHIBA THNSNH25", 8192},
	{"ATA     TOSHIBA THNSNH51", 8192},
	{"ATA     APPLE SSD TS064C", 4096},
	{"ATA     APPLE SSD TS128C", 4096},
	{"ATA     APPLE SSD TS256C", 4096},
	{"ATA     APPLE SSD TS512C", 4096},
	{"ATA     INTEL SSDSA2M040", 4096},
	{"ATA     INTEL SSDSA2M080", 4096},
	{"ATA     INTEL SSDSA2M160", 4096},
	{"ATA     INTEL SSDSC2MH12", 4096},
	{"ATA     INTEL SSDSC2MH25", 4096},
	{"ATA     OCZ CORE_SSD    ", 4096},
	{"ATA     OCZ-VERTEX      ", 4096},
	{"ATA     SAMSUNG MCCOE32G", 4096},
	{"ATA     SAMSUNG MCCOE64G", 4096},
	{"ATA     SAMSUNG SSD PM80", 4096},
	/* Flash drives optimized for 4KB IOs on larger pages */
	{"ATA     INTEL SSDSC2BA10", 4096},
	{"ATA     INTEL SSDSC2BA20", 4096},
	{"ATA     INTEL SSDSC2BA40", 4096},
	{"ATA     INTEL SSDSC2BA80", 4096},
	{"ATA     INTEL SSDSC2BB08", 4096},
	{"ATA     INTEL SSDSC2BB12", 4096},
	{"ATA     INTEL SSDSC2BB16", 4096},
	{"ATA     INTEL SSDSC2BB24", 4096},
	{"ATA     INTEL SSDSC2BB30", 4096},
	{"ATA     INTEL SSDSC2BB40", 4096},
	{"ATA     INTEL SSDSC2BB48", 4096},
	{"ATA     INTEL SSDSC2BB60", 4096},
	{"ATA     INTEL SSDSC2BB80", 4096},
	{"ATA     INTEL SSDSC2BW24", 4096},
	{"ATA     INTEL SSDSC2BW48", 4096},
	{"ATA     INTEL SSDSC2BP24", 4096},
	{"ATA     INTEL SSDSC2BP48", 4096},
	{"NA      SmrtStorSDLKAE9W", 4096},
	{"NVMe    Amazon EC2 NVMe ", 4096},
	/* Imported from Open Solaris */
	{"ATA     MARVELL SD88SA02", 4096},
	/* Advanced format Hard drives */
	{"ATA     Hitachi HDS5C303", 4096},
	{"ATA     SAMSUNG HD204UI ", 4096},
	{"ATA     ST2000DL004 HD20", 4096},
	{"ATA     WDC WD10EARS-00M", 4096},
	{"ATA     WDC WD10EARS-00S", 4096},
	{"ATA     WDC WD10EARS-00Z", 4096},
	{"ATA     WDC WD15EARS-00M", 4096},
	{"ATA     WDC WD15EARS-00S", 4096},
	{"ATA     WDC WD15EARS-00Z", 4096},
	{"ATA     WDC WD20EARS-00M", 4096},
	{"ATA     WDC WD20EARS-00S", 4096},
	{"ATA     WDC WD20EARS-00Z", 4096},
	{"ATA     WDC WD1600BEVT-0", 4096},
	{"ATA     WDC WD2500BEVT-0", 4096},
	{"ATA     WDC WD3200BEVT-0", 4096},
	{"ATA     WDC WD5000BEVT-0", 4096},
};
208 
209 
/*
 * Size in bytes of the INQUIRY reply buffer; also placed in byte 4 of the
 * INQUIRY command block and used as the SG_IO transfer length.
 */
#define	INQ_REPLY_LEN	96
/* Size in bytes of the INQUIRY command block handed to SG_IO. */
#define	INQ_CMD_LEN	6

/* Number of entries in vdev_disk_database, computed at compile time. */
static const int vdev_disk_database_size =
	sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
215 
216 boolean_t
217 check_sector_size_database(char *path, int *sector_size)
218 {
219 	unsigned char inq_buff[INQ_REPLY_LEN];
220 	unsigned char sense_buffer[32];
221 	unsigned char inq_cmd_blk[INQ_CMD_LEN] =
222 	    {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
223 	sg_io_hdr_t io_hdr;
224 	int error;
225 	int fd;
226 	int i;
227 
228 	/* Prepare INQUIRY command */
229 	memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
230 	io_hdr.interface_id = 'S';
231 	io_hdr.cmd_len = sizeof (inq_cmd_blk);
232 	io_hdr.mx_sb_len = sizeof (sense_buffer);
233 	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
234 	io_hdr.dxfer_len = INQ_REPLY_LEN;
235 	io_hdr.dxferp = inq_buff;
236 	io_hdr.cmdp = inq_cmd_blk;
237 	io_hdr.sbp = sense_buffer;
238 	io_hdr.timeout = 10;		/* 10 milliseconds is ample time */
239 
240 	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
241 		return (B_FALSE);
242 
243 	error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
244 
245 	(void) close(fd);
246 
247 	if (error < 0)
248 		return (B_FALSE);
249 
250 	if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
251 		return (B_FALSE);
252 
253 	for (i = 0; i < vdev_disk_database_size; i++) {
254 		if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
255 			continue;
256 
257 		*sector_size = vdev_disk_database[i].sector_size;
258 		return (B_TRUE);
259 	}
260 
261 	return (B_FALSE);
262 }
263 
264 static int
265 check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
266 {
267 	int err;
268 	char *value;
269 
270 	/* No valid type detected device is safe to use */
271 	value = blkid_get_tag_value(cache, "TYPE", path);
272 	if (value == NULL)
273 		return (0);
274 
275 	/*
276 	 * If libblkid detects a ZFS device, we check the device
277 	 * using check_file() to see if it's safe.  The one safe
278 	 * case is a spare device shared between multiple pools.
279 	 */
280 	if (strcmp(value, "zfs_member") == 0) {
281 		err = check_file(path, force, isspare);
282 	} else {
283 		if (force) {
284 			err = 0;
285 		} else {
286 			err = -1;
287 			vdev_error(gettext("%s contains a filesystem of "
288 			    "type '%s'\n"), path, value);
289 		}
290 	}
291 
292 	free(value);
293 
294 	return (err);
295 }
296 
297 /*
298  * Validate that a disk including all partitions are safe to use.
299  *
300  * For EFI labeled disks this can done relatively easily with the libefi
301  * library.  The partition numbers are extracted from the label and used
302  * to generate the expected /dev/ paths.  Each partition can then be
303  * checked for conflicts.
304  *
305  * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
306  * but due to the lack of a readily available libraries this scanning is
307  * not implemented.  Instead only the device path as given is checked.
308  */
309 static int
310 check_disk(const char *path, blkid_cache cache, int force,
311     boolean_t isspare, boolean_t iswholedisk)
312 {
313 	struct dk_gpt *vtoc;
314 	char slice_path[MAXPATHLEN];
315 	int err = 0;
316 	int fd, i;
317 	int flags = O_RDONLY|O_DIRECT;
318 
319 	if (!iswholedisk)
320 		return (check_slice(path, cache, force, isspare));
321 
322 	/* only spares can be shared, other devices require exclusive access */
323 	if (!isspare)
324 		flags |= O_EXCL;
325 
326 	if ((fd = open(path, flags)) < 0) {
327 		char *value = blkid_get_tag_value(cache, "TYPE", path);
328 		(void) fprintf(stderr, gettext("%s is in use and contains "
329 		    "a %s filesystem.\n"), path, value ? value : "unknown");
330 		free(value);
331 		return (-1);
332 	}
333 
334 	/*
335 	 * Expected to fail for non-EFI labeled disks.  Just check the device
336 	 * as given and do not attempt to detect and scan partitions.
337 	 */
338 	err = efi_alloc_and_read(fd, &vtoc);
339 	if (err) {
340 		(void) close(fd);
341 		return (check_slice(path, cache, force, isspare));
342 	}
343 
344 	/*
345 	 * The primary efi partition label is damaged however the secondary
346 	 * label at the end of the device is intact.  Rather than use this
347 	 * label we should play it safe and treat this as a non efi device.
348 	 */
349 	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
350 		efi_free(vtoc);
351 		(void) close(fd);
352 
353 		if (force) {
354 			/* Partitions will now be created using the backup */
355 			return (0);
356 		} else {
357 			vdev_error(gettext("%s contains a corrupt primary "
358 			    "EFI label.\n"), path);
359 			return (-1);
360 		}
361 	}
362 
363 	for (i = 0; i < vtoc->efi_nparts; i++) {
364 
365 		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
366 		    uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
367 			continue;
368 
369 		if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
370 			(void) snprintf(slice_path, sizeof (slice_path),
371 			    "%s%s%d", path, "-part", i+1);
372 		else
373 			(void) snprintf(slice_path, sizeof (slice_path),
374 			    "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
375 			    "p" : "", i+1);
376 
377 		err = check_slice(slice_path, cache, force, isspare);
378 		if (err)
379 			break;
380 	}
381 
382 	efi_free(vtoc);
383 	(void) close(fd);
384 
385 	return (err);
386 }
387 
388 int
389 check_device(const char *path, boolean_t force,
390     boolean_t isspare, boolean_t iswholedisk)
391 {
392 	blkid_cache cache;
393 	int error;
394 
395 	error = blkid_get_cache(&cache, NULL);
396 	if (error != 0) {
397 		(void) fprintf(stderr, gettext("unable to access the blkid "
398 		    "cache.\n"));
399 		return (-1);
400 	}
401 
402 	error = check_disk(path, cache, force, isspare, iswholedisk);
403 	blkid_put_cache(cache);
404 
405 	return (error);
406 }
407 
/*
 * Presumably invoked after a pool has been upgraded (per its name; the
 * callers are outside this file).  This implementation performs no work.
 */
void
after_zpool_upgrade(zpool_handle_t *zhp)
{
	/* Reference the parameter so it is not reported as unused. */
	(void) zhp;
}
413 
/*
 * Check whether 'file' is safe to use.  All arguments are forwarded
 * unchanged to check_file_generic() and its result is returned as is.
 */
int
check_file(const char *file, boolean_t force, boolean_t isspare)
{
	return (check_file_generic(file, force, isspare));
}
419