1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
25  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
26  */
27 
28 /*
29  * Functions to convert between a list of vdevs and an nvlist representing the
30  * configuration.  Each entry in the list can be one of:
31  *
32  * 	Device vdevs
33  * 		disk=(path=..., devid=...)
34  * 		file=(path=...)
35  *
36  * 	Group vdevs
37  * 		raidz[1|2]=(...)
38  * 		mirror=(...)
39  *
40  * 	Hot spares
41  *
42  * While the underlying implementation supports it, group vdevs cannot contain
43  * other group vdevs.  All userland verification of devices is contained within
44  * this file.  If successful, the nvlist returned can be passed directly to the
45  * kernel; we've done as much verification as possible in userland.
46  *
47  * Hot spares are a special case, and passed down as an array of disk vdevs, at
48  * the same level as the root of the vdev tree.
49  *
50  * The only function exported by this file is 'make_root_vdev'.  The
51  * function performs several passes:
52  *
53  * 	1. Construct the vdev specification.  Performs syntax validation and
54  *         makes sure each device is valid.
55  * 	2. Check for devices in use.  Using libdiskmgt, makes sure that no
56  *         devices are already in use elsewhere.  Some uses can be overridden
57  *         with the 'force' flag, others cannot.
58  * 	3. Check for replication errors if the 'force' flag is not specified.
59  *         This validates that the replication level is consistent across the
60  *         entire pool.
61  * 	4. Call libzfs to label any whole disks with an EFI label.
62  */
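/*
 * For example (illustrative only), the specification
 *
 * 	mirror c0t0d0 c0t1d0 spare c0t2d0
 *
 * is parsed by construct_spec() into a root nvlist shaped roughly like:
 *
 * 	root
 * 		mirror
 * 			disk (path=/dev/dsk/c0t0d0, devid=...)
 * 			disk (path=/dev/dsk/c0t1d0, devid=...)
 * 		spares
 * 			disk (path=/dev/dsk/c0t2d0)
 *
 * The device names here are hypothetical; shorthand names are expanded to a
 * full /dev path by make_leaf_vdev().
 */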
63 
64 #include <assert.h>
65 #include <devid.h>
66 #include <errno.h>
67 #include <fcntl.h>
68 #include <libintl.h>
69 #include <libnvpair.h>
70 #include <limits.h>
71 #include <stdio.h>
72 #include <string.h>
73 #include <unistd.h>
74 #include <paths.h>
75 #include <sys/stat.h>
76 #include <sys/disk.h>
77 #include <sys/mntent.h>
78 #ifdef __FreeBSD__
79 #include <libgeom.h>
80 #endif
81 #ifdef __NetBSD__
82 #include <sys/disklabel.h>
83 #include <sys/ioctl.h>
84 #endif
85 
86 #include "zpool_util.h"
87 
88 #define	BACKUP_SLICE	"s2"
89 
90 /*
91  * For any given vdev specification, we can have multiple errors.  The
92  * vdev_error() function keeps track of whether we have seen an error yet, and
93  * prints out a header if it's the first error we've seen.
94  */
95 boolean_t error_seen;
96 boolean_t is_force;
97 
98 /*PRINTFLIKE1*/
99 static void
100 vdev_error(const char *fmt, ...)
101 {
102 	va_list ap;
103 
104 	if (!error_seen) {
105 		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
106 		if (!is_force)
107 			(void) fprintf(stderr, gettext("use '-f' to override "
108 			    "the following errors:\n"));
109 		else
110 			(void) fprintf(stderr, gettext("the following errors "
111 			    "must be manually repaired:\n"));
112 		error_seen = B_TRUE;
113 	}
114 
115 	va_start(ap, fmt);
116 	(void) vfprintf(stderr, fmt, ap);
117 	va_end(ap);
118 }
119 
120 #ifdef illumos
121 static void
122 libdiskmgt_error(int error)
123 {
124 	/*
125 	 * ENXIO/ENODEV is a valid error message if the device doesn't live in
126 	 * /dev/dsk.  Don't bother printing an error message in this case.
127 	 */
128 	if (error == ENXIO || error == ENODEV)
129 		return;
130 
131 	(void) fprintf(stderr, gettext("warning: device in use checking "
132 	    "failed: %s\n"), strerror(error));
133 }
134 
135 /*
136  * Validate a device, passing the bulk of the work off to libdiskmgt.
137  */
138 static int
139 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
140 {
141 	char *msg;
142 	int error = 0;
143 	dm_who_type_t who;
144 
145 	if (force)
146 		who = DM_WHO_ZPOOL_FORCE;
147 	else if (isspare)
148 		who = DM_WHO_ZPOOL_SPARE;
149 	else
150 		who = DM_WHO_ZPOOL;
151 
152 	if (dm_inuse((char *)path, &msg, who, &error) || error) {
153 		if (error != 0) {
154 			libdiskmgt_error(error);
155 			return (0);
156 		} else {
157 			vdev_error("%s", msg);
158 			free(msg);
159 			return (-1);
160 		}
161 	}
162 
163 	/*
164 	 * If we're given a whole disk, ignore overlapping slices since we're
165 	 * about to label it anyway.
166 	 */
167 	error = 0;
168 	if (!wholedisk && !force &&
169 	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
170 		if (error == 0) {
171 			/* dm_isoverlapping returned -1 */
172 			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
173 			free(msg);
174 			return (-1);
175 		} else if (error != ENODEV) {
176 			/* libdiskmgt's devcache only handles physical drives */
177 			libdiskmgt_error(error);
178 			return (0);
179 		}
180 	}
181 
182 	return (0);
183 }
184 
185 
186 /*
187  * Validate a whole disk.  Iterate over all slices on the disk and make sure
188  * that none is in use by calling check_slice().
189  */
190 static int
191 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
192 {
193 	dm_descriptor_t *drive, *media, *slice;
194 	int err = 0;
195 	int i;
196 	int ret;
197 
198 	/*
199 	 * Get the drive associated with this disk.  This should never fail,
200 	 * because we already have an alias handle open for the device.
201 	 */
202 	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
203 	    &err)) == NULL || *drive == NULL) {
204 		if (err)
205 			libdiskmgt_error(err);
206 		return (0);
207 	}
208 
209 	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
210 	    &err)) == NULL) {
211 		dm_free_descriptors(drive);
212 		if (err)
213 			libdiskmgt_error(err);
214 		return (0);
215 	}
216 
217 	dm_free_descriptors(drive);
218 
219 	/*
220 	 * It is possible that the user has specified a removable media drive,
221 	 * and the media is not present.
222 	 */
223 	if (*media == NULL) {
224 		dm_free_descriptors(media);
225 		vdev_error(gettext("'%s' has no media in drive\n"), name);
226 		return (-1);
227 	}
228 
229 	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
230 	    &err)) == NULL) {
231 		dm_free_descriptors(media);
232 		if (err)
233 			libdiskmgt_error(err);
234 		return (0);
235 	}
236 
237 	dm_free_descriptors(media);
238 
239 	ret = 0;
240 
241 	/*
242 	 * Iterate over all slices and report any errors.  We don't care about
243 	 * overlapping slices because we are using the whole disk.
244 	 */
245 	for (i = 0; slice[i] != NULL; i++) {
246 		char *name = dm_get_name(slice[i], &err);
247 
248 		if (check_slice(name, force, B_TRUE, isspare) != 0)
249 			ret = -1;
250 
251 		dm_free_name(name);
252 	}
253 
254 	dm_free_descriptors(slice);
255 	return (ret);
256 }
257 
258 /*
259  * Validate a device.
260  */
261 static int
262 check_device(const char *path, boolean_t force, boolean_t isspare)
263 {
264 	dm_descriptor_t desc;
265 	int err;
266 	char *dev;
267 
268 	/*
269 	 * For whole disks, libdiskmgt does not include the leading dev path.
270 	 */
271 	dev = strrchr(path, '/');
272 	assert(dev != NULL);
273 	dev++;
274 	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
275 		err = check_disk(path, desc, force, isspare);
276 		dm_free_descriptor(desc);
277 		return (err);
278 	}
279 
280 	return (check_slice(path, force, B_FALSE, isspare));
281 }
282 #endif	/* illumos */
283 
284 /*
285  * Check that a file is valid.  All we can do in this case is check that it's
286  * not in use by another pool, and not in use by swap.
287  */
288 static int
289 check_file(const char *file, boolean_t force, boolean_t isspare)
290 {
291 	char  *name;
292 	int fd;
293 	int ret = 0;
294 	int err;
295 	pool_state_t state;
296 	boolean_t inuse;
297 
298 #ifdef illumos
299 	if (dm_inuse_swap(file, &err)) {
300 		if (err)
301 			libdiskmgt_error(err);
302 		else
303 			vdev_error(gettext("%s is currently used by swap. "
304 			    "Please see swap(1M).\n"), file);
305 		return (-1);
306 	}
307 #endif
308 
309 	if ((fd = open(file, O_RDONLY)) < 0)
310 		return (0);
311 
312 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
313 		const char *desc;
314 
315 		switch (state) {
316 		case POOL_STATE_ACTIVE:
317 			desc = gettext("active");
318 			break;
319 
320 		case POOL_STATE_EXPORTED:
321 			desc = gettext("exported");
322 			break;
323 
324 		case POOL_STATE_POTENTIALLY_ACTIVE:
325 			desc = gettext("potentially active");
326 			break;
327 
328 		default:
329 			desc = gettext("unknown");
330 			break;
331 		}
332 
333 		/*
334 		 * Allow hot spares to be shared between pools.
335 		 */
336 		if (state == POOL_STATE_SPARE && isspare)
337 			return (0);
338 
339 		if (state == POOL_STATE_ACTIVE ||
340 		    state == POOL_STATE_SPARE || !force) {
341 			switch (state) {
342 			case POOL_STATE_SPARE:
343 				vdev_error(gettext("%s is reserved as a hot "
344 				    "spare for pool %s\n"), file, name);
345 				break;
346 			default:
347 				vdev_error(gettext("%s is part of %s pool "
348 				    "'%s'\n"), file, desc, name);
349 				break;
350 			}
351 			ret = -1;
352 		}
353 
354 		free(name);
355 	}
356 
357 	(void) close(fd);
358 	return (ret);
359 }
360 
361 static int
362 check_device(const char *name, boolean_t force, boolean_t isspare)
363 {
364 	char path[MAXPATHLEN];
365 
366 	if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0)
367 		snprintf(path, sizeof(path), "%s%s", _PATH_DEV, name);
368 	else
369 		strlcpy(path, name, sizeof(path));
370 
371 	return (check_file(path, force, isspare));
372 }
373 
374 /*
375  * By "whole disk" we mean an entire physical disk (something we can
376  * label, toggle the write cache on, etc.) as opposed to the full
377  * capacity of a pseudo-device such as lofi or did.  We act as if we
378  * are labeling the disk, which should be a pretty good test of whether
379  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
380  * it isn't.
381  */
382 static boolean_t
383 is_whole_disk(const char *arg)
384 {
385 #ifdef illumos
386 	struct dk_gpt *label;
387 	int	fd;
388 	char	path[MAXPATHLEN];
389 
390 	(void) snprintf(path, sizeof (path), "%s%s%s",
391 	    ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
392 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
393 		return (B_FALSE);
394 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
395 		(void) close(fd);
396 		return (B_FALSE);
397 	}
398 	efi_free(label);
399 	(void) close(fd);
400 	return (B_TRUE);
401 #endif
402 #ifdef __FreeBSD__
403 	int fd;
404 
405 	fd = g_open(arg, 0);
406 	if (fd >= 0) {
407 		g_close(fd);
408 		return (B_TRUE);
409 	}
410 	return (B_FALSE);
411 #endif
412 #ifdef __NetBSD__
413 	struct disklabel dl;
414 	int fd, rv;
415 
416 	if ((fd = open(arg, O_RDWR | O_NONBLOCK)) < 0)
417 		return (B_FALSE);
418 
419 	rv = ioctl(fd, DIOCGDINFO, &dl);
420 	close(fd);
421 	return (rv == 0);
422 #endif
423 }
424 
425 /*
426  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
427  * device, fill in the device id to make a complete nvlist.  Valid forms for a
428  * leaf vdev are:
429  *
430  * 	/dev/dsk/xxx	Complete disk path
431  * 	/xxx		Full path to file
432  * 	xxx		Shorthand for /dev/dsk/xxx
433  */
434 static nvlist_t *
435 make_leaf_vdev(const char *arg, uint64_t is_log)
436 {
437 	char path[MAXPATHLEN];
438 	struct stat64 statbuf;
439 	nvlist_t *vdev = NULL;
440 	char *type = NULL;
441 	boolean_t wholedisk = B_FALSE;
442 
443 	/*
444 	 * Determine what type of vdev this is, and put the full path into
445 	 * 'path'.  We detect whether this is a device or a file afterwards by
446 	 * checking the st_mode of the file.
447 	 */
448 	if (arg[0] == '/') {
449 		/*
450 		 * Complete device or file path.  Exact type is determined by
451 		 * examining the file descriptor afterwards.
452 		 */
453 		wholedisk = is_whole_disk(arg);
454 		if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
455 			(void) fprintf(stderr,
456 			    gettext("cannot open '%s': %s\n"),
457 			    arg, strerror(errno));
458 			return (NULL);
459 		}
460 
461 		(void) strlcpy(path, arg, sizeof (path));
462 	} else {
463 		/*
464 		 * This may be a short path for a device, or it could be total
465 		 * gibberish.  Check to see if it's a known device in
466 		 * /dev/dsk/.  As part of this check, see if we've been given
467 		 * an entire disk (minus the slice number).
468 		 */
469 		if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
470 			strlcpy(path, arg, sizeof (path));
471 		else
472 			snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg);
473 		wholedisk = is_whole_disk(path);
474 		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
475 			/*
476 			 * If we got ENOENT, then the user gave us
477 			 * gibberish, so try to direct them with a
478 			 * reasonable error message.  Otherwise,
479 			 * regurgitate strerror() since it's the best we
480 			 * can do.
481 			 */
482 			if (errno == ENOENT) {
483 				(void) fprintf(stderr,
484 				    gettext("cannot open '%s': no such "
485 				    "device\n"), arg);
486 				(void) fprintf(stderr,
487 				    gettext("must be a full path or "
488 				    "shorthand device name\n"));
489 				return (NULL);
490 			} else {
491 				(void) fprintf(stderr,
492 				    gettext("cannot open '%s': %s\n"),
493 				    path, strerror(errno));
494 				return (NULL);
495 			}
496 		}
497 	}
498 
499 #ifdef __FreeBSD__
500 	if (S_ISCHR(statbuf.st_mode)) {
501 		statbuf.st_mode &= ~S_IFCHR;
502 		statbuf.st_mode |= S_IFBLK;
503 		wholedisk = B_FALSE;
504 	}
505 #endif
506 
507 	/*
508 	 * Determine whether this is a device or a file.
509 	 */
510 	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
511 		type = VDEV_TYPE_DISK;
512 	} else if (S_ISREG(statbuf.st_mode)) {
513 		type = VDEV_TYPE_FILE;
514 	} else {
515 		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
516 		    "device or regular file\n"), path);
517 		return (NULL);
518 	}
519 
520 	/*
521 	 * Finally, we have the complete device or file, and we know that it is
522 	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
523 	 * vdevs have a 'path' element, and devices also have a 'devid' element.
524 	 */
525 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
526 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
527 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
528 	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
529 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
530 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
531 		    (uint64_t)wholedisk) == 0);
532 
533 #ifdef have_devid
534 	/*
535 	 * For a whole disk, defer getting its devid until after labeling it.
536 	 */
537 	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
538 		/*
539 		 * Get the devid for the device.
540 		 */
541 		int fd;
542 		ddi_devid_t devid;
543 		char *minor = NULL, *devid_str = NULL;
544 
545 		if ((fd = open(path, O_RDONLY)) < 0) {
546 			(void) fprintf(stderr, gettext("cannot open '%s': "
547 			    "%s\n"), path, strerror(errno));
548 			nvlist_free(vdev);
549 			return (NULL);
550 		}
551 
552 		if (devid_get(fd, &devid) == 0) {
553 			if (devid_get_minor_name(fd, &minor) == 0 &&
554 			    (devid_str = devid_str_encode(devid, minor)) !=
555 			    NULL) {
556 				verify(nvlist_add_string(vdev,
557 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
558 			}
559 			if (devid_str != NULL)
560 				devid_str_free(devid_str);
561 			if (minor != NULL)
562 				devid_str_free(minor);
563 			devid_free(devid);
564 		}
565 
566 		(void) close(fd);
567 	}
568 #endif
569 
570 	return (vdev);
571 }
572 
573 /*
574  * Go through and verify the replication level of the pool is consistent.
575  * Performs the following checks:
576  *
577  * 	For the new spec, verifies that devices in mirrors and raidz are the
578  * 	same size.
579  *
580  * 	If the current configuration already has inconsistent replication
581  * 	levels, ignore any other potential problems in the new spec.
582  *
583  * 	Otherwise, make sure that the current spec (if there is one) and the new
584  * 	spec have consistent replication levels.
585  */
586 typedef struct replication_level {
587 	char *zprl_type;
588 	uint64_t zprl_children;
589 	uint64_t zprl_parity;
590 } replication_level_t;
591 
592 #define	ZPOOL_FUZZ	(16 * 1024 * 1024)
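/*
 * For example, a spec of "mirror c0d0 c1d0 mirror c2d0 c3d0" is consistent
 * (two 2-way mirrors), while "mirror c0d0 c1d0 raidz c2d0 c3d0 c4d0" mixes
 * mirror and raidz toplevels and is reported as a mismatched replication
 * level unless '-f' is used.  Likewise, leaf devices within one mirror or
 * raidz whose sizes differ by more than ZPOOL_FUZZ are reported as devices
 * of different sizes.  The device names above are hypothetical.
 */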
593 
594 /*
595  * Given a list of toplevel vdevs, return the current replication level.  If
596  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
597  * an error message will be displayed for each self-inconsistent vdev.
598  */
599 static replication_level_t *
600 get_replication(nvlist_t *nvroot, boolean_t fatal)
601 {
602 	nvlist_t **top;
603 	uint_t t, toplevels;
604 	nvlist_t **child;
605 	uint_t c, children;
606 	nvlist_t *nv;
607 	char *type;
608 	replication_level_t lastrep = {0};
609 	replication_level_t rep;
610 	replication_level_t *ret;
611 	boolean_t dontreport;
612 
613 	ret = safe_malloc(sizeof (replication_level_t));
614 
615 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
616 	    &top, &toplevels) == 0);
617 
618 	for (t = 0; t < toplevels; t++) {
619 		uint64_t is_log = B_FALSE;
620 
621 		nv = top[t];
622 
623 		/*
624 		 * For separate logs we ignore the top level vdev replication
625 		 * constraints.
626 		 */
627 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
628 		if (is_log)
629 			continue;
630 
631 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
632 		    &type) == 0);
633 		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
634 		    &child, &children) != 0) {
635 			/*
636 			 * This is a 'file' or 'disk' vdev.
637 			 */
638 			rep.zprl_type = type;
639 			rep.zprl_children = 1;
640 			rep.zprl_parity = 0;
641 		} else {
642 			uint64_t vdev_size;
643 
644 			/*
645 			 * This is a mirror or RAID-Z vdev.  Go through and make
646 			 * sure the contents are all the same (files vs. disks),
647 			 * keeping track of the number of elements in the
648 			 * process.
649 			 *
650 			 * We also check that the size of each vdev (if it can
651 			 * be determined) is the same.
652 			 */
653 			rep.zprl_type = type;
654 			rep.zprl_children = 0;
655 
656 			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
657 				verify(nvlist_lookup_uint64(nv,
658 				    ZPOOL_CONFIG_NPARITY,
659 				    &rep.zprl_parity) == 0);
660 				assert(rep.zprl_parity != 0);
661 			} else {
662 				rep.zprl_parity = 0;
663 			}
664 
665 			/*
666 			 * The 'dontreport' variable indicates that we've
667 			 * already reported an error for this spec, so don't
668 			 * bother doing it again.
669 			 */
670 			type = NULL;
671 			dontreport = 0;
672 			vdev_size = -1ULL;
673 			for (c = 0; c < children; c++) {
674 				boolean_t is_replacing, is_spare;
675 				nvlist_t *cnv = child[c];
676 				char *path;
677 				struct stat64 statbuf;
678 				uint64_t size = -1ULL;
679 				char *childtype;
680 				int fd, err;
681 
682 				rep.zprl_children++;
683 
684 				verify(nvlist_lookup_string(cnv,
685 				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
686 
687 				/*
688 				 * If this is a replacing or spare vdev, then
689 				 * get the real first child of the vdev.
690 				 */
691 				is_replacing = strcmp(childtype,
692 				    VDEV_TYPE_REPLACING) == 0;
693 				is_spare = strcmp(childtype,
694 				    VDEV_TYPE_SPARE) == 0;
695 				if (is_replacing || is_spare) {
696 					nvlist_t **rchild;
697 					uint_t rchildren;
698 
699 					verify(nvlist_lookup_nvlist_array(cnv,
700 					    ZPOOL_CONFIG_CHILDREN, &rchild,
701 					    &rchildren) == 0);
702 					assert((is_replacing && rchildren == 2)
703 					    || (is_spare && rchildren >= 2));
704 					cnv = rchild[0];
705 
706 					verify(nvlist_lookup_string(cnv,
707 					    ZPOOL_CONFIG_TYPE,
708 					    &childtype) == 0);
709 				}
710 
711 				verify(nvlist_lookup_string(cnv,
712 				    ZPOOL_CONFIG_PATH, &path) == 0);
713 
714 				/*
715 				 * If we have a raidz/mirror that combines disks
716 				 * with files, report it as an error.
717 				 */
718 				if (!dontreport && type != NULL &&
719 				    strcmp(type, childtype) != 0) {
720 					if (ret != NULL)
721 						free(ret);
722 					ret = NULL;
723 					if (fatal)
724 						vdev_error(gettext(
725 						    "mismatched replication "
726 						    "level: %s contains both "
727 						    "files and devices\n"),
728 						    rep.zprl_type);
729 					else
730 						return (NULL);
731 					dontreport = B_TRUE;
732 				}
733 
734 				/*
735 				 * According to stat(2), the value of 'st_size'
736 				 * is undefined for block devices and character
737 				 * devices.  But there is no effective way to
738 				 * determine the real size in userland.
739 				 *
740 				 * Instead, we'll take advantage of an
741 				 * implementation detail of spec_size().  If the
742 				 * device is currently open, then we (should)
743 				 * return a valid size.
744 				 *
745 				 * If we still don't get a valid size (indicated
746 				 * by a size of 0 or MAXOFFSET_T), then ignore
747 				 * this device altogether.
748 				 */
749 				if ((fd = open(path, O_RDONLY)) >= 0) {
750 					err = fstat64(fd, &statbuf);
751 					(void) close(fd);
752 				} else {
753 					err = stat64(path, &statbuf);
754 				}
755 
756 				if (err != 0 ||
757 				    statbuf.st_size == 0 ||
758 				    statbuf.st_size == MAXOFFSET_T)
759 					continue;
760 
761 				size = statbuf.st_size;
762 
763 				/*
764 				 * Also make sure that devices and
765 				 * slices have a consistent size.  If
766 				 * they differ by a significant amount
767 				 * (~16MB) then report an error.
768 				 */
769 				if (!dontreport &&
770 				    (vdev_size != -1ULL &&
771 				    (labs(size - vdev_size) >
772 				    ZPOOL_FUZZ))) {
773 					if (ret != NULL)
774 						free(ret);
775 					ret = NULL;
776 					if (fatal)
777 						vdev_error(gettext(
778 						    "%s contains devices of "
779 						    "different sizes\n"),
780 						    rep.zprl_type);
781 					else
782 						return (NULL);
783 					dontreport = B_TRUE;
784 				}
785 
786 				type = childtype;
787 				vdev_size = size;
788 			}
789 		}
790 
791 		/*
792 		 * At this point, we have the replication of the last toplevel
793 		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it's
794 		 * different.
795 		 */
796 		if (lastrep.zprl_type != NULL) {
797 			if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
798 				if (ret != NULL)
799 					free(ret);
800 				ret = NULL;
801 				if (fatal)
802 					vdev_error(gettext(
803 					    "mismatched replication level: "
804 					    "both %s and %s vdevs are "
805 					    "present\n"),
806 					    lastrep.zprl_type, rep.zprl_type);
807 				else
808 					return (NULL);
809 			} else if (lastrep.zprl_parity != rep.zprl_parity) {
810 				if (ret)
811 					free(ret);
812 				ret = NULL;
813 				if (fatal)
814 					vdev_error(gettext(
815 					    "mismatched replication level: "
816 					    "both %llu and %llu device parity "
817 					    "%s vdevs are present\n"),
818 					    lastrep.zprl_parity,
819 					    rep.zprl_parity,
820 					    rep.zprl_type);
821 				else
822 					return (NULL);
823 			} else if (lastrep.zprl_children != rep.zprl_children) {
824 				if (ret)
825 					free(ret);
826 				ret = NULL;
827 				if (fatal)
828 					vdev_error(gettext(
829 					    "mismatched replication level: "
830 					    "both %llu-way and %llu-way %s "
831 					    "vdevs are present\n"),
832 					    lastrep.zprl_children,
833 					    rep.zprl_children,
834 					    rep.zprl_type);
835 				else
836 					return (NULL);
837 			}
838 		}
839 		lastrep = rep;
840 	}
841 
842 	if (ret != NULL)
843 		*ret = rep;
844 
845 	return (ret);
846 }
847 
848 /*
849  * Check the replication level of the vdev spec against the current pool.
850  * Calls get_replication() to make sure the new spec is self-consistent.  If
851  * the current pool's replication level is already inconsistent, we ignore any
852  * errors; otherwise, report any difference between the two.
853  */
854 static int
855 check_replication(nvlist_t *config, nvlist_t *newroot)
856 {
857 	nvlist_t **child;
858 	uint_t	children;
859 	replication_level_t *current = NULL, *new;
860 	int ret;
861 
862 	/*
863 	 * If we have a current pool configuration, check to see if it's
864 	 * self-consistent.  If not, simply return success.
865 	 */
866 	if (config != NULL) {
867 		nvlist_t *nvroot;
868 
869 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
870 		    &nvroot) == 0);
871 		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
872 			return (0);
873 	}
874 	/*
875 	 * for spares there may be no children, and therefore no
876 	 * replication level to check
877 	 */
878 	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
879 	    &child, &children) != 0) || (children == 0)) {
880 		free(current);
881 		return (0);
882 	}
883 
884 	/*
885 	 * If all we have is logs then there's no replication level to check.
886 	 */
887 	if (num_logs(newroot) == children) {
888 		free(current);
889 		return (0);
890 	}
891 
892 	/*
893 	 * Get the replication level of the new vdev spec, reporting any
894 	 * inconsistencies found.
895 	 */
896 	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
897 		free(current);
898 		return (-1);
899 	}
900 
901 	/*
902 	 * Check to see if the new vdev spec matches the replication level of
903 	 * the current pool.
904 	 */
905 	ret = 0;
906 	if (current != NULL) {
907 		if (strcmp(current->zprl_type, new->zprl_type) != 0) {
908 			vdev_error(gettext(
909 			    "mismatched replication level: pool uses %s "
910 			    "and new vdev is %s\n"),
911 			    current->zprl_type, new->zprl_type);
912 			ret = -1;
913 		} else if (current->zprl_parity != new->zprl_parity) {
914 			vdev_error(gettext(
915 			    "mismatched replication level: pool uses %llu "
916 			    "device parity and new vdev uses %llu\n"),
917 			    current->zprl_parity, new->zprl_parity);
918 			ret = -1;
919 		} else if (current->zprl_children != new->zprl_children) {
920 			vdev_error(gettext(
921 			    "mismatched replication level: pool uses %llu-way "
922 			    "%s and new vdev uses %llu-way %s\n"),
923 			    current->zprl_children, current->zprl_type,
924 			    new->zprl_children, new->zprl_type);
925 			ret = -1;
926 		}
927 	}
928 
929 	free(new);
930 	if (current != NULL)
931 		free(current);
932 
933 	return (ret);
934 }
935 
936 #ifdef illumos
937 /*
938  * Go through and find any whole disks in the vdev specification, labeling them
939  * as appropriate.  When constructing the vdev spec, we were unable to open
940  * these devices in order to provide a devid.  Now that we have labeled the
941  * disk and know that slice 0 is valid, we can construct the devid.
942  *
943  * If the disk was already labeled with an EFI label, we will have gotten the
944  * devid already (because we were able to open the whole disk).  Otherwise, we
945  * need to get the devid after we label the disk.
946  */
947 static int
948 make_disks(zpool_handle_t *zhp, nvlist_t *nv)
949 {
950 	nvlist_t **child;
951 	uint_t c, children;
952 	char *type, *path, *diskname;
953 	char buf[MAXPATHLEN];
954 	uint64_t wholedisk;
955 	int fd;
956 	int ret;
957 	ddi_devid_t devid;
958 	char *minor = NULL, *devid_str = NULL;
959 
960 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
961 
962 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
963 	    &child, &children) != 0) {
964 
965 		if (strcmp(type, VDEV_TYPE_DISK) != 0)
966 			return (0);
967 
968 		/*
969 		 * We have a disk device.  Get the path to the device
970 		 * and see if it's a whole disk by appending the backup
971 		 * slice and stat()ing the device.
972 		 */
973 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
974 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
975 		    &wholedisk) != 0 || !wholedisk)
976 			return (0);
977 
978 		diskname = strrchr(path, '/');
979 		assert(diskname != NULL);
980 		diskname++;
981 		if (zpool_label_disk(g_zfs, zhp, diskname) == -1)
982 			return (-1);
983 
984 		/*
985 		 * Fill in the devid, now that we've labeled the disk.
986 		 */
987 		(void) snprintf(buf, sizeof (buf), "%ss0", path);
988 		if ((fd = open(buf, O_RDONLY)) < 0) {
989 			(void) fprintf(stderr,
990 			    gettext("cannot open '%s': %s\n"),
991 			    buf, strerror(errno));
992 			return (-1);
993 		}
994 
995 		if (devid_get(fd, &devid) == 0) {
996 			if (devid_get_minor_name(fd, &minor) == 0 &&
997 			    (devid_str = devid_str_encode(devid, minor)) !=
998 			    NULL) {
999 				verify(nvlist_add_string(nv,
1000 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
1001 			}
1002 			if (devid_str != NULL)
1003 				devid_str_free(devid_str);
1004 			if (minor != NULL)
1005 				devid_str_free(minor);
1006 			devid_free(devid);
1007 		}
1008 
1009 		/*
1010 		 * Update the path to refer to the 's0' slice.  The presence of
1011 		 * the 'whole_disk' field indicates to the CLI that we should
1012 		 * chop off the slice number when displaying the device in
1013 		 * future output.
1014 		 */
1015 		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);
1016 
1017 		(void) close(fd);
1018 
1019 		return (0);
1020 	}
1021 
1022 	for (c = 0; c < children; c++)
1023 		if ((ret = make_disks(zhp, child[c])) != 0)
1024 			return (ret);
1025 
1026 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1027 	    &child, &children) == 0)
1028 		for (c = 0; c < children; c++)
1029 			if ((ret = make_disks(zhp, child[c])) != 0)
1030 				return (ret);
1031 
1032 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1033 	    &child, &children) == 0)
1034 		for (c = 0; c < children; c++)
1035 			if ((ret = make_disks(zhp, child[c])) != 0)
1036 				return (ret);
1037 
1038 	return (0);
1039 }
1040 #endif	/* illumos */
1041 
1042 /*
1043  * Determine if the given path is a hot spare within the given configuration.
1044  */
1045 static boolean_t
1046 is_spare(nvlist_t *config, const char *path)
1047 {
1048 	int fd;
1049 	pool_state_t state;
1050 	char *name = NULL;
1051 	nvlist_t *label;
1052 	uint64_t guid, spareguid;
1053 	nvlist_t *nvroot;
1054 	nvlist_t **spares;
1055 	uint_t i, nspares;
1056 	boolean_t inuse;
1057 
1058 	if ((fd = open(path, O_RDONLY)) < 0)
1059 		return (B_FALSE);
1060 
1061 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
1062 	    !inuse ||
1063 	    state != POOL_STATE_SPARE ||
1064 	    zpool_read_label(fd, &label) != 0) {
1065 		free(name);
1066 		(void) close(fd);
1067 		return (B_FALSE);
1068 	}
1069 	free(name);
1070 	(void) close(fd);
1071 
1072 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
1073 	nvlist_free(label);
1074 
1075 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1076 	    &nvroot) == 0);
1077 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1078 	    &spares, &nspares) == 0) {
1079 		for (i = 0; i < nspares; i++) {
1080 			verify(nvlist_lookup_uint64(spares[i],
1081 			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
1082 			if (spareguid == guid)
1083 				return (B_TRUE);
1084 		}
1085 	}
1086 
1087 	return (B_FALSE);
1088 }
1089 
1090 /*
1091  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1092  * the majority of this task.
1093  */
1094 static boolean_t
1095 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1096     boolean_t replacing, boolean_t isspare)
1097 {
1098 	nvlist_t **child;
1099 	uint_t c, children;
1100 	char *type, *path;
1101 	int ret = 0;
1102 	char buf[MAXPATHLEN];
1103 	uint64_t wholedisk;
1104 	boolean_t anyinuse = B_FALSE;
1105 
1106 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1107 
1108 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1109 	    &child, &children) != 0) {
1110 
1111 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
1112 
1113 		/*
1114 		 * As a generic check, we look to see if this is a replace of a
1115 		 * hot spare within the same pool.  If so, we allow it
1116 		 * regardless of what libdiskmgt or zpool_in_use() says.
1117 		 */
1118 		if (replacing) {
1119 #ifdef illumos
1120 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1121 			    &wholedisk) == 0 && wholedisk)
1122 				(void) snprintf(buf, sizeof (buf), "%ss0",
1123 				    path);
1124 			else
1125 #endif
1126 				(void) strlcpy(buf, path, sizeof (buf));
1127 
1128 			if (is_spare(config, buf))
1129 				return (B_FALSE);
1130 		}
1131 
1132 		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1133 			ret = check_device(path, force, isspare);
1134 		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1135 			ret = check_file(path, force, isspare);
1136 
1137 		return (ret != 0);
1138 	}
1139 
1140 	for (c = 0; c < children; c++)
1141 		if (is_device_in_use(config, child[c], force, replacing,
1142 		    B_FALSE))
1143 			anyinuse = B_TRUE;
1144 
1145 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1146 	    &child, &children) == 0)
1147 		for (c = 0; c < children; c++)
1148 			if (is_device_in_use(config, child[c], force, replacing,
1149 			    B_TRUE))
1150 				anyinuse = B_TRUE;
1151 
1152 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1153 	    &child, &children) == 0)
1154 		for (c = 0; c < children; c++)
1155 			if (is_device_in_use(config, child[c], force, replacing,
1156 			    B_FALSE))
1157 				anyinuse = B_TRUE;
1158 
1159 	return (anyinuse);
1160 }
1161 
1162 static const char *
1163 is_grouping(const char *type, int *mindev, int *maxdev)
1164 {
1165 	if (strncmp(type, "raidz", 5) == 0) {
1166 		const char *p = type + 5;
1167 		char *end;
1168 		long nparity;
1169 
1170 		if (*p == '\0') {
1171 			nparity = 1;
1172 		} else if (*p == '0') {
1173 			return (NULL); /* no zero prefixes allowed */
1174 		} else {
1175 			errno = 0;
1176 			nparity = strtol(p, &end, 10);
1177 			if (errno != 0 || nparity < 1 || nparity >= 255 ||
1178 			    *end != '\0')
1179 				return (NULL);
1180 		}
1181 
1182 		if (mindev != NULL)
1183 			*mindev = nparity + 1;
1184 		if (maxdev != NULL)
1185 			*maxdev = 255;
1186 		return (VDEV_TYPE_RAIDZ);
1187 	}
1188 
1189 	if (maxdev != NULL)
1190 		*maxdev = INT_MAX;
1191 
1192 	if (strcmp(type, "mirror") == 0) {
1193 		if (mindev != NULL)
1194 			*mindev = 2;
1195 		return (VDEV_TYPE_MIRROR);
1196 	}
1197 
1198 	if (strcmp(type, "spare") == 0) {
1199 		if (mindev != NULL)
1200 			*mindev = 1;
1201 		return (VDEV_TYPE_SPARE);
1202 	}
1203 
1204 	if (strcmp(type, "log") == 0) {
1205 		if (mindev != NULL)
1206 			*mindev = 1;
1207 		return (VDEV_TYPE_LOG);
1208 	}
1209 
1210 	if (strcmp(type, "cache") == 0) {
1211 		if (mindev != NULL)
1212 			*mindev = 1;
1213 		return (VDEV_TYPE_L2CACHE);
1214 	}
1215 
1216 	return (NULL);
1217 }
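/*
 * For example:
 *
 * 	is_grouping("raidz2", &mindev, &maxdev)
 * 		returns VDEV_TYPE_RAIDZ with mindev = 3, maxdev = 255
 * 	is_grouping("mirror", &mindev, &maxdev)
 * 		returns VDEV_TYPE_MIRROR with mindev = 2, maxdev = INT_MAX
 * 	is_grouping("c0t0d0", &mindev, &maxdev)
 * 		returns NULL; the argument is treated as a leaf device
 */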
1218 
1219 /*
1220  * Construct a syntactically valid vdev specification,
1221  * and ensure that all devices and files exist and can be opened.
1222  * Note: we don't bother freeing anything in the error paths
1223  * because the program is just going to exit anyway.
1224  */
1225 nvlist_t *
1226 construct_spec(int argc, char **argv)
1227 {
1228 	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1229 	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1230 	const char *type;
1231 	uint64_t is_log;
1232 	boolean_t seen_logs;
1233 
1234 	top = NULL;
1235 	toplevels = 0;
1236 	spares = NULL;
1237 	l2cache = NULL;
1238 	nspares = 0;
1239 	nlogs = 0;
1240 	nl2cache = 0;
1241 	is_log = B_FALSE;
1242 	seen_logs = B_FALSE;
1243 
1244 	while (argc > 0) {
1245 		nv = NULL;
1246 
1247 		/*
1248 		 * If it's a mirror or raidz, the subsequent arguments are
1249 		 * its leaves -- until we encounter the next mirror or raidz.
1250 		 */
1251 		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1252 			nvlist_t **child = NULL;
1253 			int c, children = 0;
1254 
1255 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1256 				if (spares != NULL) {
1257 					(void) fprintf(stderr,
1258 					    gettext("invalid vdev "
1259 					    "specification: 'spare' can be "
1260 					    "specified only once\n"));
1261 					return (NULL);
1262 				}
1263 				is_log = B_FALSE;
1264 			}
1265 
1266 			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1267 				if (seen_logs) {
1268 					(void) fprintf(stderr,
1269 					    gettext("invalid vdev "
1270 					    "specification: 'log' can be "
1271 					    "specified only once\n"));
1272 					return (NULL);
1273 				}
1274 				seen_logs = B_TRUE;
1275 				is_log = B_TRUE;
1276 				argc--;
1277 				argv++;
1278 				/*
1279 				 * A log is not a real grouping device.
1280 				 * We just set is_log and continue.
1281 				 */
1282 				continue;
1283 			}
1284 
1285 			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1286 				if (l2cache != NULL) {
1287 					(void) fprintf(stderr,
1288 					    gettext("invalid vdev "
1289 					    "specification: 'cache' can be "
1290 					    "specified only once\n"));
1291 					return (NULL);
1292 				}
1293 				is_log = B_FALSE;
1294 			}
1295 
1296 			if (is_log) {
1297 				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1298 					(void) fprintf(stderr,
1299 					    gettext("invalid vdev "
1300 					    "specification: unsupported 'log' "
1301 					    "device: %s\n"), type);
1302 					return (NULL);
1303 				}
1304 				nlogs++;
1305 			}
1306 
1307 			for (c = 1; c < argc; c++) {
1308 				if (is_grouping(argv[c], NULL, NULL) != NULL)
1309 					break;
1310 				children++;
1311 				child = realloc(child,
1312 				    children * sizeof (nvlist_t *));
1313 				if (child == NULL)
1314 					zpool_no_memory();
1315 				if ((nv = make_leaf_vdev(argv[c], B_FALSE))
1316 				    == NULL)
1317 					return (NULL);
1318 				child[children - 1] = nv;
1319 			}
1320 
1321 			if (children < mindev) {
1322 				(void) fprintf(stderr, gettext("invalid vdev "
1323 				    "specification: %s requires at least %d "
1324 				    "devices\n"), argv[0], mindev);
1325 				return (NULL);
1326 			}
1327 
1328 			if (children > maxdev) {
1329 				(void) fprintf(stderr, gettext("invalid vdev "
1330 				    "specification: %s supports no more than "
1331 				    "%d devices\n"), argv[0], maxdev);
1332 				return (NULL);
1333 			}
1334 
1335 			argc -= c;
1336 			argv += c;
1337 
1338 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1339 				spares = child;
1340 				nspares = children;
1341 				continue;
1342 			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1343 				l2cache = child;
1344 				nl2cache = children;
1345 				continue;
1346 			} else {
1347 				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1348 				    0) == 0);
1349 				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1350 				    type) == 0);
1351 				verify(nvlist_add_uint64(nv,
1352 				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1353 				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1354 					verify(nvlist_add_uint64(nv,
1355 					    ZPOOL_CONFIG_NPARITY,
1356 					    mindev - 1) == 0);
1357 				}
1358 				verify(nvlist_add_nvlist_array(nv,
1359 				    ZPOOL_CONFIG_CHILDREN, child,
1360 				    children) == 0);
1361 
1362 				for (c = 0; c < children; c++)
1363 					nvlist_free(child[c]);
1364 				free(child);
1365 			}
1366 		} else {
1367 			/*
1368 			 * We have a device.  Pass off to make_leaf_vdev() to
1369 			 * construct the appropriate nvlist describing the vdev.
1370 			 */
1371 			if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL)
1372 				return (NULL);
1373 			if (is_log)
1374 				nlogs++;
1375 			argc--;
1376 			argv++;
1377 		}
1378 
1379 		toplevels++;
1380 		top = realloc(top, toplevels * sizeof (nvlist_t *));
1381 		if (top == NULL)
1382 			zpool_no_memory();
1383 		top[toplevels - 1] = nv;
1384 	}
1385 
1386 	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1387 		(void) fprintf(stderr, gettext("invalid vdev "
1388 		    "specification: at least one toplevel vdev must be "
1389 		    "specified\n"));
1390 		return (NULL);
1391 	}
1392 
1393 	if (seen_logs && nlogs == 0) {
1394 		(void) fprintf(stderr, gettext("invalid vdev specification: "
1395 		    "log requires at least 1 device\n"));
1396 		return (NULL);
1397 	}
1398 
1399 	/*
1400 	 * Finally, create nvroot and add all top-level vdevs to it.
1401 	 */
1402 	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1403 	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1404 	    VDEV_TYPE_ROOT) == 0);
1405 	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1406 	    top, toplevels) == 0);
1407 	if (nspares != 0)
1408 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1409 		    spares, nspares) == 0);
1410 	if (nl2cache != 0)
1411 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1412 		    l2cache, nl2cache) == 0);
1413 
1414 	for (t = 0; t < toplevels; t++)
1415 		nvlist_free(top[t]);
1416 	for (t = 0; t < nspares; t++)
1417 		nvlist_free(spares[t]);
1418 	for (t = 0; t < nl2cache; t++)
1419 		nvlist_free(l2cache[t]);
1420 	if (spares)
1421 		free(spares);
1422 	if (l2cache)
1423 		free(l2cache);
1424 	free(top);
1425 
1426 	return (nvroot);
1427 }
1428 
1429 nvlist_t *
1430 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1431     splitflags_t flags, int argc, char **argv)
1432 {
1433 	nvlist_t *newroot = NULL, **child;
1434 	uint_t c, children;
1435 
1436 	if (argc > 0) {
1437 		if ((newroot = construct_spec(argc, argv)) == NULL) {
1438 			(void) fprintf(stderr, gettext("Unable to build a "
1439 			    "pool from the specified devices\n"));
1440 			return (NULL);
1441 		}
1442 
1443 #ifdef illumos
1444 		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1445 			nvlist_free(newroot);
1446 			return (NULL);
1447 		}
1448 #endif
1449 
1450 		/* avoid any tricks in the spec */
1451 		verify(nvlist_lookup_nvlist_array(newroot,
1452 		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1453 		for (c = 0; c < children; c++) {
1454 			char *path;
1455 			const char *type;
1456 			int min, max;
1457 
1458 			verify(nvlist_lookup_string(child[c],
1459 			    ZPOOL_CONFIG_PATH, &path) == 0);
1460 			if ((type = is_grouping(path, &min, &max)) != NULL) {
1461 				(void) fprintf(stderr, gettext("Cannot use "
1462 				    "'%s' as a device for splitting\n"), type);
1463 				nvlist_free(newroot);
1464 				return (NULL);
1465 			}
1466 		}
1467 	}
1468 
1469 	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1470 		nvlist_free(newroot);
1471 		return (NULL);
1472 	}
1473 
1474 	return (newroot);
1475 }
1476 
1477 /*
1478  * Get and validate the contents of the given vdev specification.  This ensures
1479  * that the nvlist returned is well-formed, that all the devices exist, and that
1480  * they are not currently in use by any other known consumer.  The 'poolconfig'
1481  * parameter is the current configuration of the pool when adding devices to an
1482  * existing pool, and is used to perform additional checks, such as changing the
1483  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1484  * new pool.  The 'force' flag controls whether devices should be forcefully
1485  * added, even if they appear in use.
1486  */
1487 nvlist_t *
1488 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
1489     boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1490 {
1491 	nvlist_t *newroot;
1492 	nvlist_t *poolconfig = NULL;
1493 	is_force = force;
1494 
1495 	/*
1496 	 * Construct the vdev specification.  If this is successful, we know
1497 	 * that we have a valid specification, and that all devices can be
1498 	 * opened.
1499 	 */
1500 	if ((newroot = construct_spec(argc, argv)) == NULL)
1501 		return (NULL);
1502 
1503 	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
1504 		return (NULL);
1505 
1506 	/*
1507 	 * Validate each device to make sure that its not shared with another
1508 	 * subsystem.  We do this even if 'force' is set, because there are some
1509 	 * uses (such as a dedicated dump device) that even '-f' cannot
1510 	 * override.
1511 	 */
1512 	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1513 		nvlist_free(newroot);
1514 		return (NULL);
1515 	}
1516 
1517 	/*
1518 	 * Check the replication level of the given vdevs and report any errors
1519 	 * found.  We include the existing pool spec, if any, as we need to
1520 	 * catch changes against the existing replication level.
1521 	 */
1522 	if (check_rep && check_replication(poolconfig, newroot) != 0) {
1523 		nvlist_free(newroot);
1524 		return (NULL);
1525 	}
1526 
1527 #ifdef illumos
1528 	/*
1529 	 * Run through the vdev specification and label any whole disks found.
1530 	 */
1531 	if (!dryrun && make_disks(zhp, newroot) != 0) {
1532 		nvlist_free(newroot);
1533 		return (NULL);
1534 	}
1535 #endif
1536 
1537 	return (newroot);
1538 }
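/*
 * Illustrative sketch only (kept out of the build): a hypothetical caller
 * constructing a root vdev for a new two-way mirror.  The device names and
 * the follow-on call into libzfs are assumptions for illustration; g_zfs is
 * assumed to have been initialized as it is elsewhere in zpool(1M).
 */
#if 0
static void
example_make_root_vdev(void)
{
	char *argv[] = { "mirror", "c0t0d0", "c0t1d0" };
	nvlist_t *nvroot;

	/* No pool handle, no force, check replication, not replacing, no dry run. */
	nvroot = make_root_vdev(NULL, B_FALSE, B_TRUE, B_FALSE, B_FALSE,
	    3, argv);
	if (nvroot != NULL) {
		/* ... pass nvroot to zpool_create() or zpool_add() ... */
		nvlist_free(nvroot);
	}
}
#endif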
1539