1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2018 by Delphix. All rights reserved.
26  */
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <errno.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <unistd.h>
34 #include <uuid/uuid.h>
35 #include <zlib.h>
36 #include <libintl.h>
37 #include <sys/types.h>
38 #include <sys/dkio.h>
39 #include <sys/vtoc.h>
40 #include <sys/mhd.h>
41 #include <sys/param.h>
42 #include <sys/dktp/fdisk.h>
43 #include <sys/efi_partition.h>
44 #include <sys/byteorder.h>
45 #include <sys/vdev_disk.h>
46 #include <linux/fs.h>
47 #include <linux/blkpg.h>
48 
49 static struct uuid_to_ptag {
50 	struct uuid	uuid;
51 } conversion_array[] = {
52 	{ EFI_UNUSED },
53 	{ EFI_BOOT },
54 	{ EFI_ROOT },
55 	{ EFI_SWAP },
56 	{ EFI_USR },
57 	{ EFI_BACKUP },
58 	{ EFI_UNUSED },		/* STAND is never used */
59 	{ EFI_VAR },
60 	{ EFI_HOME },
61 	{ EFI_ALTSCTR },
62 	{ EFI_UNUSED },		/* CACHE (cachefs) is never used */
63 	{ EFI_RESERVED },
64 	{ EFI_SYSTEM },
65 	{ EFI_LEGACY_MBR },
66 	{ EFI_SYMC_PUB },
67 	{ EFI_SYMC_CDS },
68 	{ EFI_MSFT_RESV },
69 	{ EFI_DELL_BASIC },
70 	{ EFI_DELL_RAID },
71 	{ EFI_DELL_SWAP },
72 	{ EFI_DELL_LVM },
73 	{ EFI_DELL_RESV },
74 	{ EFI_AAPL_HFS },
75 	{ EFI_AAPL_UFS },
76 	{ EFI_FREEBSD_BOOT },
77 	{ EFI_FREEBSD_SWAP },
78 	{ EFI_FREEBSD_UFS },
79 	{ EFI_FREEBSD_VINUM },
80 	{ EFI_FREEBSD_ZFS },
81 	{ EFI_BIOS_BOOT },
82 	{ EFI_INTC_RS },
83 	{ EFI_SNE_BOOT },
84 	{ EFI_LENOVO_BOOT },
85 	{ EFI_MSFT_LDMM },
86 	{ EFI_MSFT_LDMD },
87 	{ EFI_MSFT_RE },
88 	{ EFI_IBM_GPFS },
89 	{ EFI_MSFT_STORAGESPACES },
90 	{ EFI_HPQ_DATA },
91 	{ EFI_HPQ_SVC },
92 	{ EFI_RHT_DATA },
93 	{ EFI_RHT_HOME },
94 	{ EFI_RHT_SRV },
95 	{ EFI_RHT_DMCRYPT },
96 	{ EFI_RHT_LUKS },
97 	{ EFI_FREEBSD_DISKLABEL },
98 	{ EFI_AAPL_RAID },
99 	{ EFI_AAPL_RAIDOFFLINE },
100 	{ EFI_AAPL_BOOT },
101 	{ EFI_AAPL_LABEL },
102 	{ EFI_AAPL_TVRECOVERY },
103 	{ EFI_AAPL_CORESTORAGE },
104 	{ EFI_NETBSD_SWAP },
105 	{ EFI_NETBSD_FFS },
106 	{ EFI_NETBSD_LFS },
107 	{ EFI_NETBSD_RAID },
108 	{ EFI_NETBSD_CAT },
109 	{ EFI_NETBSD_CRYPT },
110 	{ EFI_GOOG_KERN },
111 	{ EFI_GOOG_ROOT },
112 	{ EFI_GOOG_RESV },
113 	{ EFI_HAIKU_BFS },
114 	{ EFI_MIDNIGHTBSD_BOOT },
115 	{ EFI_MIDNIGHTBSD_DATA },
116 	{ EFI_MIDNIGHTBSD_SWAP },
117 	{ EFI_MIDNIGHTBSD_UFS },
118 	{ EFI_MIDNIGHTBSD_VINUM },
119 	{ EFI_MIDNIGHTBSD_ZFS },
120 	{ EFI_CEPH_JOURNAL },
121 	{ EFI_CEPH_DMCRYPTJOURNAL },
122 	{ EFI_CEPH_OSD },
123 	{ EFI_CEPH_DMCRYPTOSD },
124 	{ EFI_CEPH_CREATE },
125 	{ EFI_CEPH_DMCRYPTCREATE },
126 	{ EFI_OPENBSD_DISKLABEL },
127 	{ EFI_BBRY_QNX },
128 	{ EFI_BELL_PLAN9 },
129 	{ EFI_VMW_KCORE },
130 	{ EFI_VMW_VMFS },
131 	{ EFI_VMW_RESV },
132 	{ EFI_RHT_ROOTX86 },
133 	{ EFI_RHT_ROOTAMD64 },
134 	{ EFI_RHT_ROOTARM },
135 	{ EFI_RHT_ROOTARM64 },
136 	{ EFI_ACRONIS_SECUREZONE },
137 	{ EFI_ONIE_BOOT },
138 	{ EFI_ONIE_CONFIG },
139 	{ EFI_IBM_PPRPBOOT },
140 	{ EFI_FREEDESKTOP_BOOT }
141 };
142 
143 /*
144  * Default vtoc information for non-SVr4 partitions
145  */
146 struct dk_map2  default_vtoc_map[NDKMAP] = {
147 	{	V_ROOT,		0	},		/* a - 0 */
148 	{	V_SWAP,		V_UNMNT	},		/* b - 1 */
149 	{	V_BACKUP,	V_UNMNT	},		/* c - 2 */
150 	{	V_UNASSIGNED,	0	},		/* d - 3 */
151 	{	V_UNASSIGNED,	0	},		/* e - 4 */
152 	{	V_UNASSIGNED,	0	},		/* f - 5 */
153 	{	V_USR,		0	},		/* g - 6 */
154 	{	V_UNASSIGNED,	0	},		/* h - 7 */
155 
156 #if defined(_SUNOS_VTOC_16)
157 
158 #if defined(i386) || defined(__amd64) || defined(__arm) || \
159     defined(__powerpc) || defined(__sparc) || defined(__s390__) || \
160     defined(__mips__) || defined(__rv64g__)
161 	{	V_BOOT,		V_UNMNT	},		/* i - 8 */
162 	{	V_ALTSCTR,	0	},		/* j - 9 */
163 
164 #else
165 #error No VTOC format defined.
166 #endif			/* defined(i386) */
167 
168 	{	V_UNASSIGNED,	0	},		/* k - 10 */
169 	{	V_UNASSIGNED,	0	},		/* l - 11 */
170 	{	V_UNASSIGNED,	0	},		/* m - 12 */
171 	{	V_UNASSIGNED,	0	},		/* n - 13 */
172 	{	V_UNASSIGNED,	0	},		/* o - 14 */
173 	{	V_UNASSIGNED,	0	},		/* p - 15 */
174 #endif			/* defined(_SUNOS_VTOC_16) */
175 };
176 
/*
 * Set to a non-zero value to have the routines in this file report
 * diagnostic details on stderr.
 */
int efi_debug = 0;

/* Forward declaration: efi_alloc_and_read() calls this before its definition. */
static int efi_read(int fd, struct dk_gpt *vtoc);
180 
181 /*
182  * Return a 32-bit CRC of the contents of the buffer.  Pre-and-post
183  * one's conditioning will be handled by crc32() internally.
184  */
185 static uint32_t
186 efi_crc32(const unsigned char *buf, unsigned int size)
187 {
188 	uint32_t crc = crc32(0, Z_NULL, 0);
189 
190 	crc = crc32(crc, buf, size);
191 
192 	return (crc);
193 }
194 
195 static int
196 read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize)
197 {
198 	int sector_size;
199 	unsigned long long capacity_size;
200 
201 	if (ioctl(fd, BLKSSZGET, &sector_size) < 0)
202 		return (-1);
203 
204 	if (ioctl(fd, BLKGETSIZE64, &capacity_size) < 0)
205 		return (-1);
206 
207 	*lbsize = (uint_t)sector_size;
208 	*capacity = (diskaddr_t)(capacity_size / sector_size);
209 
210 	return (0);
211 }
212 
/*
 * Resolve the path of the device that `fd' refers to.
 *
 * libefi callers only hand us an open descriptor, never a path, so the
 * name is recovered by running realpath(3) on /proc/self/fd/<fd>.
 *
 * Returns a string allocated by realpath() which the caller must free(),
 * or NULL if the path could not be resolved.
 */
static char *
efi_get_devname(int fd)
{
	char proc_link[32];

	(void) snprintf(proc_link, sizeof (proc_link), "/proc/self/fd/%d", fd);

	return (realpath(proc_link, NULL));
}
231 
232 static int
233 efi_get_info(int fd, struct dk_cinfo *dki_info)
234 {
235 	char *dev_path;
236 	int rval = 0;
237 
238 	memset(dki_info, 0, sizeof (*dki_info));
239 
240 	/*
241 	 * The simplest way to get the partition number under linux is
242 	 * to parse it out of the /dev/<disk><partition> block device name.
243 	 * The kernel creates this using the partition number when it
244 	 * populates /dev/ so it may be trusted.  The tricky bit here is
245 	 * that the naming convention is based on the block device type.
246 	 * So we need to take this in to account when parsing out the
247 	 * partition information.  Aside from the partition number we collect
248 	 * some additional device info.
249 	 */
250 	dev_path = efi_get_devname(fd);
251 	if (dev_path == NULL)
252 		goto error;
253 
254 	if ((strncmp(dev_path, "/dev/sd", 7) == 0)) {
255 		strcpy(dki_info->dki_cname, "sd");
256 		dki_info->dki_ctype = DKC_SCSI_CCS;
257 		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
258 		    dki_info->dki_dname,
259 		    &dki_info->dki_partition);
260 	} else if ((strncmp(dev_path, "/dev/hd", 7) == 0)) {
261 		strcpy(dki_info->dki_cname, "hd");
262 		dki_info->dki_ctype = DKC_DIRECT;
263 		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
264 		    dki_info->dki_dname,
265 		    &dki_info->dki_partition);
266 	} else if ((strncmp(dev_path, "/dev/md", 7) == 0)) {
267 		strcpy(dki_info->dki_cname, "pseudo");
268 		dki_info->dki_ctype = DKC_MD;
269 		strcpy(dki_info->dki_dname, "md");
270 		rval = sscanf(dev_path, "/dev/md%[0-9]p%hu",
271 		    dki_info->dki_dname + 2,
272 		    &dki_info->dki_partition);
273 	} else if ((strncmp(dev_path, "/dev/vd", 7) == 0)) {
274 		strcpy(dki_info->dki_cname, "vd");
275 		dki_info->dki_ctype = DKC_MD;
276 		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
277 		    dki_info->dki_dname,
278 		    &dki_info->dki_partition);
279 	} else if ((strncmp(dev_path, "/dev/xvd", 8) == 0)) {
280 		strcpy(dki_info->dki_cname, "xvd");
281 		dki_info->dki_ctype = DKC_MD;
282 		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
283 		    dki_info->dki_dname,
284 		    &dki_info->dki_partition);
285 	} else if ((strncmp(dev_path, "/dev/zd", 7) == 0)) {
286 		strcpy(dki_info->dki_cname, "zd");
287 		dki_info->dki_ctype = DKC_MD;
288 		strcpy(dki_info->dki_dname, "zd");
289 		rval = sscanf(dev_path, "/dev/zd%[0-9]p%hu",
290 		    dki_info->dki_dname + 2,
291 		    &dki_info->dki_partition);
292 	} else if ((strncmp(dev_path, "/dev/dm-", 8) == 0)) {
293 		strcpy(dki_info->dki_cname, "pseudo");
294 		dki_info->dki_ctype = DKC_VBD;
295 		strcpy(dki_info->dki_dname, "dm-");
296 		rval = sscanf(dev_path, "/dev/dm-%[0-9]p%hu",
297 		    dki_info->dki_dname + 3,
298 		    &dki_info->dki_partition);
299 	} else if ((strncmp(dev_path, "/dev/ram", 8) == 0)) {
300 		strcpy(dki_info->dki_cname, "pseudo");
301 		dki_info->dki_ctype = DKC_PCMCIA_MEM;
302 		strcpy(dki_info->dki_dname, "ram");
303 		rval = sscanf(dev_path, "/dev/ram%[0-9]p%hu",
304 		    dki_info->dki_dname + 3,
305 		    &dki_info->dki_partition);
306 	} else if ((strncmp(dev_path, "/dev/loop", 9) == 0)) {
307 		strcpy(dki_info->dki_cname, "pseudo");
308 		dki_info->dki_ctype = DKC_VBD;
309 		strcpy(dki_info->dki_dname, "loop");
310 		rval = sscanf(dev_path, "/dev/loop%[0-9]p%hu",
311 		    dki_info->dki_dname + 4,
312 		    &dki_info->dki_partition);
313 	} else if ((strncmp(dev_path, "/dev/nvme", 9) == 0)) {
314 		strcpy(dki_info->dki_cname, "nvme");
315 		dki_info->dki_ctype = DKC_SCSI_CCS;
316 		strcpy(dki_info->dki_dname, "nvme");
317 		(void) sscanf(dev_path, "/dev/nvme%[0-9]",
318 		    dki_info->dki_dname + 4);
319 		size_t controller_length = strlen(
320 		    dki_info->dki_dname);
321 		strcpy(dki_info->dki_dname + controller_length,
322 		    "n");
323 		rval = sscanf(dev_path,
324 		    "/dev/nvme%*[0-9]n%[0-9]p%hu",
325 		    dki_info->dki_dname + controller_length + 1,
326 		    &dki_info->dki_partition);
327 	} else {
328 		strcpy(dki_info->dki_dname, "unknown");
329 		strcpy(dki_info->dki_cname, "unknown");
330 		dki_info->dki_ctype = DKC_UNKNOWN;
331 	}
332 
333 	switch (rval) {
334 	case 0:
335 		errno = EINVAL;
336 		goto error;
337 	case 1:
338 		dki_info->dki_partition = 0;
339 	}
340 
341 	free(dev_path);
342 
343 	return (0);
344 error:
345 	if (efi_debug)
346 		(void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno);
347 
348 	switch (errno) {
349 	case EIO:
350 		return (VT_EIO);
351 	case EINVAL:
352 		return (VT_EINVAL);
353 	default:
354 		return (VT_ERROR);
355 	}
356 }
357 
/*
 * Number of blocks an EFI label occupies for `parts' partition entries on
 * a device with `blksz'-byte blocks: the entry array rounded up to whole
 * blocks, plus one more block.
 */
#define	NBLOCKS(parts, blksz)	\
	(1 + ((((parts) * (int)sizeof (efi_gpe_t)) + ((blksz) - 1)) / \
	    (blksz)))

/*
 * Largest number of partitions supported -- limited by what we can malloc
 * for a struct dk_gpt followed by its struct dk_part array.
 */
#define	MAX_PARTS	\
	((4294967295UL - sizeof (struct dk_gpt)) / sizeof (struct dk_part))
367 
368 int
369 efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
370 {
371 	diskaddr_t	capacity = 0;
372 	uint_t		lbsize = 0;
373 	uint_t		nblocks;
374 	size_t		length;
375 	struct dk_gpt	*vptr;
376 	struct uuid	uuid;
377 	struct dk_cinfo	dki_info;
378 
379 	if (read_disk_info(fd, &capacity, &lbsize) != 0)
380 		return (-1);
381 
382 	if (efi_get_info(fd, &dki_info) != 0)
383 		return (-1);
384 
385 	if (dki_info.dki_partition != 0)
386 		return (-1);
387 
388 	if ((dki_info.dki_ctype == DKC_PCMCIA_MEM) ||
389 	    (dki_info.dki_ctype == DKC_VBD) ||
390 	    (dki_info.dki_ctype == DKC_UNKNOWN))
391 		return (-1);
392 
393 	nblocks = NBLOCKS(nparts, lbsize);
394 	if ((nblocks * lbsize) < EFI_MIN_ARRAY_SIZE + lbsize) {
395 		/* 16K plus one block for the GPT */
396 		nblocks = EFI_MIN_ARRAY_SIZE / lbsize + 1;
397 	}
398 
399 	if (nparts > MAX_PARTS) {
400 		if (efi_debug) {
401 			(void) fprintf(stderr,
402 			"the maximum number of partitions supported is %lu\n",
403 			    MAX_PARTS);
404 		}
405 		return (-1);
406 	}
407 
408 	length = sizeof (struct dk_gpt) +
409 	    sizeof (struct dk_part) * (nparts - 1);
410 
411 	vptr = calloc(1, length);
412 	if (vptr == NULL)
413 		return (-1);
414 
415 	*vtoc = vptr;
416 
417 	vptr->efi_version = EFI_VERSION_CURRENT;
418 	vptr->efi_lbasize = lbsize;
419 	vptr->efi_nparts = nparts;
420 	/*
421 	 * add one block here for the PMBR; on disks with a 512 byte
422 	 * block size and 128 or fewer partitions, efi_first_u_lba
423 	 * should work out to "34"
424 	 */
425 	vptr->efi_first_u_lba = nblocks + 1;
426 	vptr->efi_last_lba = capacity - 1;
427 	vptr->efi_altern_lba = capacity -1;
428 	vptr->efi_last_u_lba = vptr->efi_last_lba - nblocks;
429 
430 	(void) uuid_generate((uchar_t *)&uuid);
431 	UUID_LE_CONVERT(vptr->efi_disk_uguid, uuid);
432 	return (0);
433 }
434 
435 /*
436  * Read EFI - return partition number upon success.
437  */
438 int
439 efi_alloc_and_read(int fd, struct dk_gpt **vtoc)
440 {
441 	int			rval;
442 	uint32_t		nparts;
443 	int			length;
444 	struct dk_gpt		*vptr;
445 
446 	/* figure out the number of entries that would fit into 16K */
447 	nparts = EFI_MIN_ARRAY_SIZE / sizeof (efi_gpe_t);
448 	length = (int) sizeof (struct dk_gpt) +
449 	    (int) sizeof (struct dk_part) * (nparts - 1);
450 	vptr = calloc(1, length);
451 
452 	if (vptr == NULL)
453 		return (VT_ERROR);
454 
455 	vptr->efi_nparts = nparts;
456 	rval = efi_read(fd, vptr);
457 
458 	if ((rval == VT_EINVAL) && vptr->efi_nparts > nparts) {
459 		void *tmp;
460 		length = (int) sizeof (struct dk_gpt) +
461 		    (int) sizeof (struct dk_part) * (vptr->efi_nparts - 1);
462 		nparts = vptr->efi_nparts;
463 		if ((tmp = realloc(vptr, length)) == NULL) {
464 			/* cppcheck-suppress doubleFree */
465 			free(vptr);
466 			*vtoc = NULL;
467 			return (VT_ERROR);
468 		} else {
469 			vptr = tmp;
470 			rval = efi_read(fd, vptr);
471 		}
472 	}
473 
474 	if (rval < 0) {
475 		if (efi_debug) {
476 			(void) fprintf(stderr,
477 			    "read of EFI table failed, rval=%d\n", rval);
478 		}
479 		free(vptr);
480 		*vtoc = NULL;
481 	} else {
482 		*vtoc = vptr;
483 	}
484 
485 	return (rval);
486 }
487 
488 static int
489 efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc)
490 {
491 	void *data = dk_ioc->dki_data;
492 	int error;
493 	diskaddr_t capacity;
494 	uint_t lbsize;
495 
496 	/*
497 	 * When the IO is not being performed in kernel as an ioctl we need
498 	 * to know the sector size so we can seek to the proper byte offset.
499 	 */
500 	if (read_disk_info(fd, &capacity, &lbsize) == -1) {
501 		if (efi_debug)
502 			fprintf(stderr, "unable to read disk info: %d", errno);
503 
504 		errno = EIO;
505 		return (-1);
506 	}
507 
508 	switch (cmd) {
509 	case DKIOCGETEFI:
510 		if (lbsize == 0) {
511 			if (efi_debug)
512 				(void) fprintf(stderr, "DKIOCGETEFI assuming "
513 				    "LBA %d bytes\n", DEV_BSIZE);
514 
515 			lbsize = DEV_BSIZE;
516 		}
517 
518 		error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET);
519 		if (error == -1) {
520 			if (efi_debug)
521 				(void) fprintf(stderr, "DKIOCGETEFI lseek "
522 				    "error: %d\n", errno);
523 			return (error);
524 		}
525 
526 		error = read(fd, data, dk_ioc->dki_length);
527 		if (error == -1) {
528 			if (efi_debug)
529 				(void) fprintf(stderr, "DKIOCGETEFI read "
530 				    "error: %d\n", errno);
531 			return (error);
532 		}
533 
534 		if (error != dk_ioc->dki_length) {
535 			if (efi_debug)
536 				(void) fprintf(stderr, "DKIOCGETEFI short "
537 				    "read of %d bytes\n", error);
538 			errno = EIO;
539 			return (-1);
540 		}
541 		error = 0;
542 		break;
543 
544 	case DKIOCSETEFI:
545 		if (lbsize == 0) {
546 			if (efi_debug)
547 				(void) fprintf(stderr, "DKIOCSETEFI unknown "
548 				    "LBA size\n");
549 			errno = EIO;
550 			return (-1);
551 		}
552 
553 		error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET);
554 		if (error == -1) {
555 			if (efi_debug)
556 				(void) fprintf(stderr, "DKIOCSETEFI lseek "
557 				    "error: %d\n", errno);
558 			return (error);
559 		}
560 
561 		error = write(fd, data, dk_ioc->dki_length);
562 		if (error == -1) {
563 			if (efi_debug)
564 				(void) fprintf(stderr, "DKIOCSETEFI write "
565 				    "error: %d\n", errno);
566 			return (error);
567 		}
568 
569 		if (error != dk_ioc->dki_length) {
570 			if (efi_debug)
571 				(void) fprintf(stderr, "DKIOCSETEFI short "
572 				    "write of %d bytes\n", error);
573 			errno = EIO;
574 			return (-1);
575 		}
576 
577 		/* Sync the new EFI table to disk */
578 		error = fsync(fd);
579 		if (error == -1)
580 			return (error);
581 
582 		/* Ensure any local disk cache is also flushed */
583 		if (ioctl(fd, BLKFLSBUF, 0) == -1)
584 			return (error);
585 
586 		error = 0;
587 		break;
588 
589 	default:
590 		if (efi_debug)
591 			(void) fprintf(stderr, "unsupported ioctl()\n");
592 
593 		errno = EIO;
594 		return (-1);
595 	}
596 
597 	return (error);
598 }
599 
600 int
601 efi_rescan(int fd)
602 {
603 	int retry = 10;
604 	int error;
605 
606 	/* Notify the kernel a devices partition table has been updated */
607 	while ((error = ioctl(fd, BLKRRPART)) != 0) {
608 		if ((--retry == 0) || (errno != EBUSY)) {
609 			(void) fprintf(stderr, "the kernel failed to rescan "
610 			    "the partition table: %d\n", errno);
611 			return (-1);
612 		}
613 		usleep(50000);
614 	}
615 
616 	return (0);
617 }
618 
619 static int
620 check_label(int fd, dk_efi_t *dk_ioc)
621 {
622 	efi_gpt_t		*efi;
623 	uint_t			crc;
624 
625 	if (efi_ioctl(fd, DKIOCGETEFI, dk_ioc) == -1) {
626 		switch (errno) {
627 		case EIO:
628 			return (VT_EIO);
629 		default:
630 			return (VT_ERROR);
631 		}
632 	}
633 	efi = dk_ioc->dki_data;
634 	if (efi->efi_gpt_Signature != LE_64(EFI_SIGNATURE)) {
635 		if (efi_debug)
636 			(void) fprintf(stderr,
637 			    "Bad EFI signature: 0x%llx != 0x%llx\n",
638 			    (long long)efi->efi_gpt_Signature,
639 			    (long long)LE_64(EFI_SIGNATURE));
640 		return (VT_EINVAL);
641 	}
642 
643 	/*
644 	 * check CRC of the header; the size of the header should
645 	 * never be larger than one block
646 	 */
647 	crc = efi->efi_gpt_HeaderCRC32;
648 	efi->efi_gpt_HeaderCRC32 = 0;
649 	len_t headerSize = (len_t)LE_32(efi->efi_gpt_HeaderSize);
650 
651 	if (headerSize < EFI_MIN_LABEL_SIZE || headerSize > EFI_LABEL_SIZE) {
652 		if (efi_debug)
653 			(void) fprintf(stderr,
654 			    "Invalid EFI HeaderSize %llu.  Assuming %d.\n",
655 			    headerSize, EFI_MIN_LABEL_SIZE);
656 	}
657 
658 	if ((headerSize > dk_ioc->dki_length) ||
659 	    crc != LE_32(efi_crc32((unsigned char *)efi, headerSize))) {
660 		if (efi_debug)
661 			(void) fprintf(stderr,
662 			    "Bad EFI CRC: 0x%x != 0x%x\n",
663 			    crc, LE_32(efi_crc32((unsigned char *)efi,
664 			    headerSize)));
665 		return (VT_EINVAL);
666 	}
667 
668 	return (0);
669 }
670 
671 static int
672 efi_read(int fd, struct dk_gpt *vtoc)
673 {
674 	int			i, j;
675 	int			label_len;
676 	int			rval = 0;
677 	int			md_flag = 0;
678 	int			vdc_flag = 0;
679 	diskaddr_t		capacity = 0;
680 	uint_t			lbsize = 0;
681 	struct dk_minfo		disk_info;
682 	dk_efi_t		dk_ioc;
683 	efi_gpt_t		*efi;
684 	efi_gpe_t		*efi_parts;
685 	struct dk_cinfo		dki_info;
686 	uint32_t		user_length;
687 	boolean_t		legacy_label = B_FALSE;
688 
689 	/*
690 	 * get the partition number for this file descriptor.
691 	 */
692 	if ((rval = efi_get_info(fd, &dki_info)) != 0)
693 		return (rval);
694 
695 	if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) &&
696 	    (strncmp(dki_info.dki_dname, "md", 3) == 0)) {
697 		md_flag++;
698 	} else if ((strncmp(dki_info.dki_cname, "vdc", 4) == 0) &&
699 	    (strncmp(dki_info.dki_dname, "vdc", 4) == 0)) {
700 		/*
701 		 * The controller and drive name "vdc" (virtual disk client)
702 		 * indicates a LDoms virtual disk.
703 		 */
704 		vdc_flag++;
705 	}
706 
707 	/* get the LBA size */
708 	if (read_disk_info(fd, &capacity, &lbsize) == -1) {
709 		if (efi_debug) {
710 			(void) fprintf(stderr,
711 			    "unable to read disk info: %d",
712 			    errno);
713 		}
714 		return (VT_EINVAL);
715 	}
716 
717 	disk_info.dki_lbsize = lbsize;
718 	disk_info.dki_capacity = capacity;
719 
720 	if (disk_info.dki_lbsize == 0) {
721 		if (efi_debug) {
722 			(void) fprintf(stderr,
723 			    "efi_read: assuming LBA 512 bytes\n");
724 		}
725 		disk_info.dki_lbsize = DEV_BSIZE;
726 	}
727 	/*
728 	 * Read the EFI GPT to figure out how many partitions we need
729 	 * to deal with.
730 	 */
731 	dk_ioc.dki_lba = 1;
732 	if (NBLOCKS(vtoc->efi_nparts, disk_info.dki_lbsize) < 34) {
733 		label_len = EFI_MIN_ARRAY_SIZE + disk_info.dki_lbsize;
734 	} else {
735 		label_len = vtoc->efi_nparts * (int) sizeof (efi_gpe_t) +
736 		    disk_info.dki_lbsize;
737 		if (label_len % disk_info.dki_lbsize) {
738 			/* pad to physical sector size */
739 			label_len += disk_info.dki_lbsize;
740 			label_len &= ~(disk_info.dki_lbsize - 1);
741 		}
742 	}
743 
744 	if (posix_memalign((void **)&dk_ioc.dki_data,
745 	    disk_info.dki_lbsize, label_len))
746 		return (VT_ERROR);
747 
748 	memset(dk_ioc.dki_data, 0, label_len);
749 	dk_ioc.dki_length = disk_info.dki_lbsize;
750 	user_length = vtoc->efi_nparts;
751 	efi = dk_ioc.dki_data;
752 	if (md_flag) {
753 		dk_ioc.dki_length = label_len;
754 		if (efi_ioctl(fd, DKIOCGETEFI, &dk_ioc) == -1) {
755 			switch (errno) {
756 			case EIO:
757 				return (VT_EIO);
758 			default:
759 				return (VT_ERROR);
760 			}
761 		}
762 	} else if ((rval = check_label(fd, &dk_ioc)) == VT_EINVAL) {
763 		/*
764 		 * No valid label here; try the alternate. Note that here
765 		 * we just read GPT header and save it into dk_ioc.data,
766 		 * Later, we will read GUID partition entry array if we
767 		 * can get valid GPT header.
768 		 */
769 
770 		/*
771 		 * This is a workaround for legacy systems. In the past, the
772 		 * last sector of SCSI disk was invisible on x86 platform. At
773 		 * that time, backup label was saved on the next to the last
774 		 * sector. It is possible for users to move a disk from previous
775 		 * solaris system to present system. Here, we attempt to search
776 		 * legacy backup EFI label first.
777 		 */
778 		dk_ioc.dki_lba = disk_info.dki_capacity - 2;
779 		dk_ioc.dki_length = disk_info.dki_lbsize;
780 		rval = check_label(fd, &dk_ioc);
781 		if (rval == VT_EINVAL) {
782 			/*
783 			 * we didn't find legacy backup EFI label, try to
784 			 * search backup EFI label in the last block.
785 			 */
786 			dk_ioc.dki_lba = disk_info.dki_capacity - 1;
787 			dk_ioc.dki_length = disk_info.dki_lbsize;
788 			rval = check_label(fd, &dk_ioc);
789 			if (rval == 0) {
790 				legacy_label = B_TRUE;
791 				if (efi_debug)
792 					(void) fprintf(stderr,
793 					    "efi_read: primary label corrupt; "
794 					    "using EFI backup label located on"
795 					    " the last block\n");
796 			}
797 		} else {
798 			if ((efi_debug) && (rval == 0))
799 				(void) fprintf(stderr, "efi_read: primary label"
800 				    " corrupt; using legacy EFI backup label "
801 				    " located on the next to last block\n");
802 		}
803 
804 		if (rval == 0) {
805 			dk_ioc.dki_lba = LE_64(efi->efi_gpt_PartitionEntryLBA);
806 			vtoc->efi_flags |= EFI_GPT_PRIMARY_CORRUPT;
807 			vtoc->efi_nparts =
808 			    LE_32(efi->efi_gpt_NumberOfPartitionEntries);
809 			/*
810 			 * Partition tables are between backup GPT header
811 			 * table and ParitionEntryLBA (the starting LBA of
812 			 * the GUID partition entries array). Now that we
813 			 * already got valid GPT header and saved it in
814 			 * dk_ioc.dki_data, we try to get GUID partition
815 			 * entry array here.
816 			 */
817 			/* LINTED */
818 			dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data
819 			    + disk_info.dki_lbsize);
820 			if (legacy_label)
821 				dk_ioc.dki_length = disk_info.dki_capacity - 1 -
822 				    dk_ioc.dki_lba;
823 			else
824 				dk_ioc.dki_length = disk_info.dki_capacity - 2 -
825 				    dk_ioc.dki_lba;
826 			dk_ioc.dki_length *= disk_info.dki_lbsize;
827 			if (dk_ioc.dki_length >
828 			    ((len_t)label_len - sizeof (*dk_ioc.dki_data))) {
829 				rval = VT_EINVAL;
830 			} else {
831 				/*
832 				 * read GUID partition entry array
833 				 */
834 				rval = efi_ioctl(fd, DKIOCGETEFI, &dk_ioc);
835 			}
836 		}
837 
838 	} else if (rval == 0) {
839 
840 		dk_ioc.dki_lba = LE_64(efi->efi_gpt_PartitionEntryLBA);
841 		/* LINTED */
842 		dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data
843 		    + disk_info.dki_lbsize);
844 		dk_ioc.dki_length = label_len - disk_info.dki_lbsize;
845 		rval = efi_ioctl(fd, DKIOCGETEFI, &dk_ioc);
846 
847 	} else if (vdc_flag && rval == VT_ERROR && errno == EINVAL) {
848 		/*
849 		 * When the device is a LDoms virtual disk, the DKIOCGETEFI
850 		 * ioctl can fail with EINVAL if the virtual disk backend
851 		 * is a ZFS volume serviced by a domain running an old version
852 		 * of Solaris. This is because the DKIOCGETEFI ioctl was
853 		 * initially incorrectly implemented for a ZFS volume and it
854 		 * expected the GPT and GPE to be retrieved with a single ioctl.
855 		 * So we try to read the GPT and the GPE using that old style
856 		 * ioctl.
857 		 */
858 		dk_ioc.dki_lba = 1;
859 		dk_ioc.dki_length = label_len;
860 		rval = check_label(fd, &dk_ioc);
861 	}
862 
863 	if (rval < 0) {
864 		free(efi);
865 		return (rval);
866 	}
867 
868 	/* LINTED -- always longlong aligned */
869 	efi_parts = (efi_gpe_t *)(((char *)efi) + disk_info.dki_lbsize);
870 
871 	/*
872 	 * Assemble this into a "dk_gpt" struct for easier
873 	 * digestibility by applications.
874 	 */
875 	vtoc->efi_version = LE_32(efi->efi_gpt_Revision);
876 	vtoc->efi_nparts = LE_32(efi->efi_gpt_NumberOfPartitionEntries);
877 	vtoc->efi_part_size = LE_32(efi->efi_gpt_SizeOfPartitionEntry);
878 	vtoc->efi_lbasize = disk_info.dki_lbsize;
879 	vtoc->efi_last_lba = disk_info.dki_capacity - 1;
880 	vtoc->efi_first_u_lba = LE_64(efi->efi_gpt_FirstUsableLBA);
881 	vtoc->efi_last_u_lba = LE_64(efi->efi_gpt_LastUsableLBA);
882 	vtoc->efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
883 	UUID_LE_CONVERT(vtoc->efi_disk_uguid, efi->efi_gpt_DiskGUID);
884 
885 	/*
886 	 * If the array the user passed in is too small, set the length
887 	 * to what it needs to be and return
888 	 */
889 	if (user_length < vtoc->efi_nparts) {
890 		return (VT_EINVAL);
891 	}
892 
893 	for (i = 0; i < vtoc->efi_nparts; i++) {
894 
895 		UUID_LE_CONVERT(vtoc->efi_parts[i].p_guid,
896 		    efi_parts[i].efi_gpe_PartitionTypeGUID);
897 
898 		for (j = 0;
899 		    j < sizeof (conversion_array)
900 		    / sizeof (struct uuid_to_ptag); j++) {
901 
902 			if (bcmp(&vtoc->efi_parts[i].p_guid,
903 			    &conversion_array[j].uuid,
904 			    sizeof (struct uuid)) == 0) {
905 				vtoc->efi_parts[i].p_tag = j;
906 				break;
907 			}
908 		}
909 		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED)
910 			continue;
911 		vtoc->efi_parts[i].p_flag =
912 		    LE_16(efi_parts[i].efi_gpe_Attributes.PartitionAttrs);
913 		vtoc->efi_parts[i].p_start =
914 		    LE_64(efi_parts[i].efi_gpe_StartingLBA);
915 		vtoc->efi_parts[i].p_size =
916 		    LE_64(efi_parts[i].efi_gpe_EndingLBA) -
917 		    vtoc->efi_parts[i].p_start + 1;
918 		for (j = 0; j < EFI_PART_NAME_LEN; j++) {
919 			vtoc->efi_parts[i].p_name[j] =
920 			    (uchar_t)LE_16(
921 			    efi_parts[i].efi_gpe_PartitionName[j]);
922 		}
923 
924 		UUID_LE_CONVERT(vtoc->efi_parts[i].p_uguid,
925 		    efi_parts[i].efi_gpe_UniquePartitionGUID);
926 	}
927 	free(efi);
928 
929 	return (dki_info.dki_partition);
930 }
931 
932 /* writes a "protective" MBR */
933 static int
934 write_pmbr(int fd, struct dk_gpt *vtoc)
935 {
936 	dk_efi_t	dk_ioc;
937 	struct mboot	mb;
938 	uchar_t		*cp;
939 	diskaddr_t	size_in_lba;
940 	uchar_t		*buf;
941 	int		len;
942 
943 	len = (vtoc->efi_lbasize == 0) ? sizeof (mb) : vtoc->efi_lbasize;
944 	if (posix_memalign((void **)&buf, len, len))
945 		return (VT_ERROR);
946 
947 	/*
948 	 * Preserve any boot code and disk signature if the first block is
949 	 * already an MBR.
950 	 */
951 	memset(buf, 0, len);
952 	dk_ioc.dki_lba = 0;
953 	dk_ioc.dki_length = len;
954 	/* LINTED -- always longlong aligned */
955 	dk_ioc.dki_data = (efi_gpt_t *)buf;
956 	if (efi_ioctl(fd, DKIOCGETEFI, &dk_ioc) == -1) {
957 		(void) memcpy(&mb, buf, sizeof (mb));
958 		bzero(&mb, sizeof (mb));
959 		mb.signature = LE_16(MBB_MAGIC);
960 	} else {
961 		(void) memcpy(&mb, buf, sizeof (mb));
962 		if (mb.signature != LE_16(MBB_MAGIC)) {
963 			bzero(&mb, sizeof (mb));
964 			mb.signature = LE_16(MBB_MAGIC);
965 		}
966 	}
967 
968 	bzero(&mb.parts, sizeof (mb.parts));
969 	cp = (uchar_t *)&mb.parts[0];
970 	/* bootable or not */
971 	*cp++ = 0;
972 	/* beginning CHS; 0xffffff if not representable */
973 	*cp++ = 0xff;
974 	*cp++ = 0xff;
975 	*cp++ = 0xff;
976 	/* OS type */
977 	*cp++ = EFI_PMBR;
978 	/* ending CHS; 0xffffff if not representable */
979 	*cp++ = 0xff;
980 	*cp++ = 0xff;
981 	*cp++ = 0xff;
982 	/* starting LBA: 1 (little endian format) by EFI definition */
983 	*cp++ = 0x01;
984 	*cp++ = 0x00;
985 	*cp++ = 0x00;
986 	*cp++ = 0x00;
987 	/* ending LBA: last block on the disk (little endian format) */
988 	size_in_lba = vtoc->efi_last_lba;
989 	if (size_in_lba < 0xffffffff) {
990 		*cp++ = (size_in_lba & 0x000000ff);
991 		*cp++ = (size_in_lba & 0x0000ff00) >> 8;
992 		*cp++ = (size_in_lba & 0x00ff0000) >> 16;
993 		*cp++ = (size_in_lba & 0xff000000) >> 24;
994 	} else {
995 		*cp++ = 0xff;
996 		*cp++ = 0xff;
997 		*cp++ = 0xff;
998 		*cp++ = 0xff;
999 	}
1000 
1001 	(void) memcpy(buf, &mb, sizeof (mb));
1002 	/* LINTED -- always longlong aligned */
1003 	dk_ioc.dki_data = (efi_gpt_t *)buf;
1004 	dk_ioc.dki_lba = 0;
1005 	dk_ioc.dki_length = len;
1006 	if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) {
1007 		free(buf);
1008 		switch (errno) {
1009 		case EIO:
1010 			return (VT_EIO);
1011 		case EINVAL:
1012 			return (VT_EINVAL);
1013 		default:
1014 			return (VT_ERROR);
1015 		}
1016 	}
1017 	free(buf);
1018 	return (0);
1019 }
1020 
1021 /* make sure the user specified something reasonable */
1022 static int
1023 check_input(struct dk_gpt *vtoc)
1024 {
1025 	int			resv_part = -1;
1026 	int			i, j;
1027 	diskaddr_t		istart, jstart, isize, jsize, endsect;
1028 
1029 	/*
1030 	 * Sanity-check the input (make sure no partitions overlap)
1031 	 */
1032 	for (i = 0; i < vtoc->efi_nparts; i++) {
1033 		/* It can't be unassigned and have an actual size */
1034 		if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) &&
1035 		    (vtoc->efi_parts[i].p_size != 0)) {
1036 			if (efi_debug) {
1037 				(void) fprintf(stderr, "partition %d is "
1038 				    "\"unassigned\" but has a size of %llu",
1039 				    i, vtoc->efi_parts[i].p_size);
1040 			}
1041 			return (VT_EINVAL);
1042 		}
1043 		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) {
1044 			if (uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
1045 				continue;
1046 			/* we have encountered an unknown uuid */
1047 			vtoc->efi_parts[i].p_tag = 0xff;
1048 		}
1049 		if (vtoc->efi_parts[i].p_tag == V_RESERVED) {
1050 			if (resv_part != -1) {
1051 				if (efi_debug) {
1052 					(void) fprintf(stderr, "found "
1053 					    "duplicate reserved partition "
1054 					    "at %d\n", i);
1055 				}
1056 				return (VT_EINVAL);
1057 			}
1058 			resv_part = i;
1059 		}
1060 		if ((vtoc->efi_parts[i].p_start < vtoc->efi_first_u_lba) ||
1061 		    (vtoc->efi_parts[i].p_start > vtoc->efi_last_u_lba)) {
1062 			if (efi_debug) {
1063 				(void) fprintf(stderr,
1064 				    "Partition %d starts at %llu.  ",
1065 				    i,
1066 				    vtoc->efi_parts[i].p_start);
1067 				(void) fprintf(stderr,
1068 				    "It must be between %llu and %llu.\n",
1069 				    vtoc->efi_first_u_lba,
1070 				    vtoc->efi_last_u_lba);
1071 			}
1072 			return (VT_EINVAL);
1073 		}
1074 		if ((vtoc->efi_parts[i].p_start +
1075 		    vtoc->efi_parts[i].p_size <
1076 		    vtoc->efi_first_u_lba) ||
1077 		    (vtoc->efi_parts[i].p_start +
1078 		    vtoc->efi_parts[i].p_size >
1079 		    vtoc->efi_last_u_lba + 1)) {
1080 			if (efi_debug) {
1081 				(void) fprintf(stderr,
1082 				    "Partition %d ends at %llu.  ",
1083 				    i,
1084 				    vtoc->efi_parts[i].p_start +
1085 				    vtoc->efi_parts[i].p_size);
1086 				(void) fprintf(stderr,
1087 				    "It must be between %llu and %llu.\n",
1088 				    vtoc->efi_first_u_lba,
1089 				    vtoc->efi_last_u_lba);
1090 			}
1091 			return (VT_EINVAL);
1092 		}
1093 
1094 		for (j = 0; j < vtoc->efi_nparts; j++) {
1095 			isize = vtoc->efi_parts[i].p_size;
1096 			jsize = vtoc->efi_parts[j].p_size;
1097 			istart = vtoc->efi_parts[i].p_start;
1098 			jstart = vtoc->efi_parts[j].p_start;
1099 			if ((i != j) && (isize != 0) && (jsize != 0)) {
1100 				endsect = jstart + jsize -1;
1101 				if ((jstart <= istart) &&
1102 				    (istart <= endsect)) {
1103 					if (efi_debug) {
1104 						(void) fprintf(stderr,
1105 						    "Partition %d overlaps "
1106 						    "partition %d.", i, j);
1107 					}
1108 					return (VT_EINVAL);
1109 				}
1110 			}
1111 		}
1112 	}
1113 	/* just a warning for now */
1114 	if ((resv_part == -1) && efi_debug) {
1115 		(void) fprintf(stderr,
1116 		    "no reserved partition found\n");
1117 	}
1118 	return (0);
1119 }
1120 
1121 static int
1122 call_blkpg_ioctl(int fd, int command, diskaddr_t start,
1123     diskaddr_t size, uint_t pno)
1124 {
1125 	struct blkpg_ioctl_arg ioctl_arg;
1126 	struct blkpg_partition  linux_part;
1127 	memset(&linux_part, 0, sizeof (linux_part));
1128 
1129 	char *path = efi_get_devname(fd);
1130 	if (path == NULL) {
1131 		(void) fprintf(stderr, "failed to retrieve device name\n");
1132 		return (VT_EINVAL);
1133 	}
1134 
1135 	linux_part.start = start;
1136 	linux_part.length = size;
1137 	linux_part.pno = pno;
1138 	snprintf(linux_part.devname, BLKPG_DEVNAMELTH - 1, "%s%u", path, pno);
1139 	linux_part.devname[BLKPG_DEVNAMELTH - 1] = '\0';
1140 	free(path);
1141 
1142 	ioctl_arg.op = command;
1143 	ioctl_arg.flags = 0;
1144 	ioctl_arg.datalen = sizeof (struct blkpg_partition);
1145 	ioctl_arg.data = &linux_part;
1146 
1147 	return (ioctl(fd, BLKPG, &ioctl_arg));
1148 }
1149 
1150 /*
1151  * add all the unallocated space to the current label
1152  */
1153 int
1154 efi_use_whole_disk(int fd)
1155 {
1156 	struct dk_gpt *efi_label = NULL;
1157 	int rval;
1158 	int i;
1159 	uint_t resv_index = 0, data_index = 0;
1160 	diskaddr_t resv_start = 0, data_start = 0;
1161 	diskaddr_t data_size, limit, difference;
1162 	boolean_t sync_needed = B_FALSE;
1163 	uint_t nblocks;
1164 
1165 	rval = efi_alloc_and_read(fd, &efi_label);
1166 	if (rval < 0) {
1167 		if (efi_label != NULL)
1168 			efi_free(efi_label);
1169 		return (rval);
1170 	}
1171 
1172 	/*
1173 	 * Find the last physically non-zero partition.
1174 	 * This should be the reserved partition.
1175 	 */
1176 	for (i = 0; i < efi_label->efi_nparts; i ++) {
1177 		if (resv_start < efi_label->efi_parts[i].p_start) {
1178 			resv_start = efi_label->efi_parts[i].p_start;
1179 			resv_index = i;
1180 		}
1181 	}
1182 
1183 	/*
1184 	 * Find the last physically non-zero partition before that.
1185 	 * This is the data partition.
1186 	 */
1187 	for (i = 0; i < resv_index; i ++) {
1188 		if (data_start < efi_label->efi_parts[i].p_start) {
1189 			data_start = efi_label->efi_parts[i].p_start;
1190 			data_index = i;
1191 		}
1192 	}
1193 	data_size = efi_label->efi_parts[data_index].p_size;
1194 
1195 	/*
1196 	 * See the "efi_alloc_and_init" function for more information
1197 	 * about where this "nblocks" value comes from.
1198 	 */
1199 	nblocks = efi_label->efi_first_u_lba - 1;
1200 
1201 	/*
1202 	 * Determine if the EFI label is out of sync. We check that:
1203 	 *
1204 	 * 1. the data partition ends at the limit we set, and
1205 	 * 2. the reserved partition starts at the limit we set.
1206 	 *
1207 	 * If either of these conditions is not met, then we need to
1208 	 * resync the EFI label.
1209 	 *
1210 	 * The limit is the last usable LBA, determined by the last LBA
1211 	 * and the first usable LBA fields on the EFI label of the disk
1212 	 * (see the lines directly above). Additionally, we factor in
1213 	 * EFI_MIN_RESV_SIZE (per its use in "zpool_label_disk") and
1214 	 * P2ALIGN it to ensure the partition boundaries are aligned
1215 	 * (for performance reasons). The alignment should match the
1216 	 * alignment used by the "zpool_label_disk" function.
1217 	 */
1218 	limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE,
1219 	    PARTITION_END_ALIGNMENT);
1220 	if (data_start + data_size != limit || resv_start != limit)
1221 		sync_needed = B_TRUE;
1222 
1223 	if (efi_debug && sync_needed)
1224 		(void) fprintf(stderr, "efi_use_whole_disk: sync needed\n");
1225 
1226 	/*
1227 	 * If alter_lba is 1, we are using the backup label.
1228 	 * Since we can locate the backup label by disk capacity,
1229 	 * there must be no unallocated space.
1230 	 */
1231 	if ((efi_label->efi_altern_lba == 1) || (efi_label->efi_altern_lba
1232 	    >= efi_label->efi_last_lba && !sync_needed)) {
1233 		if (efi_debug) {
1234 			(void) fprintf(stderr,
1235 			    "efi_use_whole_disk: requested space not found\n");
1236 		}
1237 		efi_free(efi_label);
1238 		return (VT_ENOSPC);
1239 	}
1240 
1241 	/*
1242 	 * Verify that we've found the reserved partition by checking
1243 	 * that it looks the way it did when we created it in zpool_label_disk.
1244 	 * If we've found the incorrect partition, then we know that this
1245 	 * device was reformatted and no longer is solely used by ZFS.
1246 	 */
1247 	if ((efi_label->efi_parts[resv_index].p_size != EFI_MIN_RESV_SIZE) ||
1248 	    (efi_label->efi_parts[resv_index].p_tag != V_RESERVED) ||
1249 	    (resv_index != 8)) {
1250 		if (efi_debug) {
1251 			(void) fprintf(stderr,
1252 			    "efi_use_whole_disk: wholedisk not available\n");
1253 		}
1254 		efi_free(efi_label);
1255 		return (VT_ENOSPC);
1256 	}
1257 
1258 	if (data_start + data_size != resv_start) {
1259 		if (efi_debug) {
1260 			(void) fprintf(stderr,
1261 			    "efi_use_whole_disk: "
1262 			    "data_start (%lli) + "
1263 			    "data_size (%lli) != "
1264 			    "resv_start (%lli)\n",
1265 			    data_start, data_size, resv_start);
1266 		}
1267 
1268 		return (VT_EINVAL);
1269 	}
1270 
1271 	if (limit < resv_start) {
1272 		if (efi_debug) {
1273 			(void) fprintf(stderr,
1274 			    "efi_use_whole_disk: "
1275 			    "limit (%lli) < resv_start (%lli)\n",
1276 			    limit, resv_start);
1277 		}
1278 
1279 		return (VT_EINVAL);
1280 	}
1281 
1282 	difference = limit - resv_start;
1283 
1284 	if (efi_debug)
1285 		(void) fprintf(stderr,
1286 		    "efi_use_whole_disk: difference is %lli\n", difference);
1287 
1288 	/*
1289 	 * Move the reserved partition. There is currently no data in
1290 	 * here except fabricated devids (which get generated via
1291 	 * efi_write()). So there is no need to copy data.
1292 	 */
1293 	efi_label->efi_parts[data_index].p_size += difference;
1294 	efi_label->efi_parts[resv_index].p_start += difference;
1295 	efi_label->efi_last_u_lba = efi_label->efi_last_lba - nblocks;
1296 
1297 	/*
1298 	 * Rescanning the partition table in the kernel can result
1299 	 * in the device links to be removed (see comment in vdev_disk_open).
1300 	 * If BLKPG_RESIZE_PARTITION is available, then we can resize
1301 	 * the partition table online and avoid having to remove the device
1302 	 * links used by the pool. This provides a very deterministic
1303 	 * approach to resizing devices and does not require any
1304 	 * loops waiting for devices to reappear.
1305 	 */
1306 #ifdef BLKPG_RESIZE_PARTITION
1307 	/*
1308 	 * Delete the reserved partition since we're about to expand
1309 	 * the data partition and it would overlap with the reserved
1310 	 * partition.
1311 	 * NOTE: The starting index for the ioctl is 1 while for the
1312 	 * EFI partitions it's 0. For that reason we have to add one
1313 	 * whenever we make an ioctl call.
1314 	 */
1315 	rval = call_blkpg_ioctl(fd, BLKPG_DEL_PARTITION, 0, 0, resv_index + 1);
1316 	if (rval != 0)
1317 		goto out;
1318 
1319 	/*
1320 	 * Expand the data partition
1321 	 */
1322 	rval = call_blkpg_ioctl(fd, BLKPG_RESIZE_PARTITION,
1323 	    efi_label->efi_parts[data_index].p_start * efi_label->efi_lbasize,
1324 	    efi_label->efi_parts[data_index].p_size * efi_label->efi_lbasize,
1325 	    data_index + 1);
1326 	if (rval != 0) {
1327 		(void) fprintf(stderr, "Unable to resize data "
1328 		    "partition:  %d\n", rval);
1329 		/*
1330 		 * Since we failed to resize, we need to reset the start
1331 		 * of the reserve partition and re-create it.
1332 		 */
1333 		efi_label->efi_parts[resv_index].p_start -= difference;
1334 	}
1335 
1336 	/*
1337 	 * Re-add the reserved partition. If we've expanded the data partition
1338 	 * then we'll move the reserve partition to the end of the data
1339 	 * partition. Otherwise, we'll recreate the partition in its original
1340 	 * location. Note that we do this as best-effort and ignore any
1341 	 * errors that may arise here. This will ensure that we finish writing
1342 	 * the EFI label.
1343 	 */
1344 	(void) call_blkpg_ioctl(fd, BLKPG_ADD_PARTITION,
1345 	    efi_label->efi_parts[resv_index].p_start * efi_label->efi_lbasize,
1346 	    efi_label->efi_parts[resv_index].p_size * efi_label->efi_lbasize,
1347 	    resv_index + 1);
1348 #endif
1349 
1350 	/*
1351 	 * We're now ready to write the EFI label.
1352 	 */
1353 	if (rval == 0) {
1354 		rval = efi_write(fd, efi_label);
1355 		if (rval < 0 && efi_debug) {
1356 			(void) fprintf(stderr, "efi_use_whole_disk:fail "
1357 			    "to write label, rval=%d\n", rval);
1358 		}
1359 	}
1360 
1361 out:
1362 	efi_free(efi_label);
1363 	return (rval);
1364 }
1365 
1366 /*
1367  * write EFI label and backup label
1368  */
1369 int
1370 efi_write(int fd, struct dk_gpt *vtoc)
1371 {
1372 	dk_efi_t		dk_ioc;
1373 	efi_gpt_t		*efi;
1374 	efi_gpe_t		*efi_parts;
1375 	int			i, j;
1376 	struct dk_cinfo		dki_info;
1377 	int			rval;
1378 	int			md_flag = 0;
1379 	int			nblocks;
1380 	diskaddr_t		lba_backup_gpt_hdr;
1381 
1382 	if ((rval = efi_get_info(fd, &dki_info)) != 0)
1383 		return (rval);
1384 
1385 	/* check if we are dealing with a metadevice */
1386 	if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) &&
1387 	    (strncmp(dki_info.dki_dname, "md", 3) == 0)) {
1388 		md_flag = 1;
1389 	}
1390 
1391 	if (check_input(vtoc)) {
1392 		/*
1393 		 * not valid; if it's a metadevice just pass it down
1394 		 * because SVM will do its own checking
1395 		 */
1396 		if (md_flag == 0) {
1397 			return (VT_EINVAL);
1398 		}
1399 	}
1400 
1401 	dk_ioc.dki_lba = 1;
1402 	if (NBLOCKS(vtoc->efi_nparts, vtoc->efi_lbasize) < 34) {
1403 		dk_ioc.dki_length = EFI_MIN_ARRAY_SIZE + vtoc->efi_lbasize;
1404 	} else {
1405 		dk_ioc.dki_length = NBLOCKS(vtoc->efi_nparts,
1406 		    vtoc->efi_lbasize) *
1407 		    vtoc->efi_lbasize;
1408 	}
1409 
1410 	/*
1411 	 * the number of blocks occupied by GUID partition entry array
1412 	 */
1413 	nblocks = dk_ioc.dki_length / vtoc->efi_lbasize - 1;
1414 
1415 	/*
1416 	 * Backup GPT header is located on the block after GUID
1417 	 * partition entry array. Here, we calculate the address
1418 	 * for backup GPT header.
1419 	 */
1420 	lba_backup_gpt_hdr = vtoc->efi_last_u_lba + 1 + nblocks;
1421 	if (posix_memalign((void **)&dk_ioc.dki_data,
1422 	    vtoc->efi_lbasize, dk_ioc.dki_length))
1423 		return (VT_ERROR);
1424 
1425 	memset(dk_ioc.dki_data, 0, dk_ioc.dki_length);
1426 	efi = dk_ioc.dki_data;
1427 
1428 	/* stuff user's input into EFI struct */
1429 	efi->efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1430 	efi->efi_gpt_Revision = LE_32(vtoc->efi_version); /* 0x02000100 */
1431 	efi->efi_gpt_HeaderSize = LE_32(sizeof (struct efi_gpt) - LEN_EFI_PAD);
1432 	efi->efi_gpt_Reserved1 = 0;
1433 	efi->efi_gpt_MyLBA = LE_64(1ULL);
1434 	efi->efi_gpt_AlternateLBA = LE_64(lba_backup_gpt_hdr);
1435 	efi->efi_gpt_FirstUsableLBA = LE_64(vtoc->efi_first_u_lba);
1436 	efi->efi_gpt_LastUsableLBA = LE_64(vtoc->efi_last_u_lba);
1437 	efi->efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1438 	efi->efi_gpt_NumberOfPartitionEntries = LE_32(vtoc->efi_nparts);
1439 	efi->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (struct efi_gpe));
1440 	UUID_LE_CONVERT(efi->efi_gpt_DiskGUID, vtoc->efi_disk_uguid);
1441 
1442 	/* LINTED -- always longlong aligned */
1443 	efi_parts = (efi_gpe_t *)((char *)dk_ioc.dki_data + vtoc->efi_lbasize);
1444 
1445 	for (i = 0; i < vtoc->efi_nparts; i++) {
1446 		for (j = 0;
1447 		    j < sizeof (conversion_array) /
1448 		    sizeof (struct uuid_to_ptag); j++) {
1449 
1450 			if (vtoc->efi_parts[i].p_tag == j) {
1451 				UUID_LE_CONVERT(
1452 				    efi_parts[i].efi_gpe_PartitionTypeGUID,
1453 				    conversion_array[j].uuid);
1454 				break;
1455 			}
1456 		}
1457 
1458 		if (j == sizeof (conversion_array) /
1459 		    sizeof (struct uuid_to_ptag)) {
1460 			/*
1461 			 * If we didn't have a matching uuid match, bail here.
1462 			 * Don't write a label with unknown uuid.
1463 			 */
1464 			if (efi_debug) {
1465 				(void) fprintf(stderr,
1466 				    "Unknown uuid for p_tag %d\n",
1467 				    vtoc->efi_parts[i].p_tag);
1468 			}
1469 			return (VT_EINVAL);
1470 		}
1471 
1472 		/* Zero's should be written for empty partitions */
1473 		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED)
1474 			continue;
1475 
1476 		efi_parts[i].efi_gpe_StartingLBA =
1477 		    LE_64(vtoc->efi_parts[i].p_start);
1478 		efi_parts[i].efi_gpe_EndingLBA =
1479 		    LE_64(vtoc->efi_parts[i].p_start +
1480 		    vtoc->efi_parts[i].p_size - 1);
1481 		efi_parts[i].efi_gpe_Attributes.PartitionAttrs =
1482 		    LE_16(vtoc->efi_parts[i].p_flag);
1483 		for (j = 0; j < EFI_PART_NAME_LEN; j++) {
1484 			efi_parts[i].efi_gpe_PartitionName[j] =
1485 			    LE_16((ushort_t)vtoc->efi_parts[i].p_name[j]);
1486 		}
1487 		if ((vtoc->efi_parts[i].p_tag != V_UNASSIGNED) &&
1488 		    uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_uguid)) {
1489 			(void) uuid_generate((uchar_t *)
1490 			    &vtoc->efi_parts[i].p_uguid);
1491 		}
1492 		bcopy(&vtoc->efi_parts[i].p_uguid,
1493 		    &efi_parts[i].efi_gpe_UniquePartitionGUID,
1494 		    sizeof (uuid_t));
1495 	}
1496 	efi->efi_gpt_PartitionEntryArrayCRC32 =
1497 	    LE_32(efi_crc32((unsigned char *)efi_parts,
1498 	    vtoc->efi_nparts * (int)sizeof (struct efi_gpe)));
1499 	efi->efi_gpt_HeaderCRC32 =
1500 	    LE_32(efi_crc32((unsigned char *)efi,
1501 	    LE_32(efi->efi_gpt_HeaderSize)));
1502 
1503 	if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) {
1504 		free(dk_ioc.dki_data);
1505 		switch (errno) {
1506 		case EIO:
1507 			return (VT_EIO);
1508 		case EINVAL:
1509 			return (VT_EINVAL);
1510 		default:
1511 			return (VT_ERROR);
1512 		}
1513 	}
1514 	/* if it's a metadevice we're done */
1515 	if (md_flag) {
1516 		free(dk_ioc.dki_data);
1517 		return (0);
1518 	}
1519 
1520 	/* write backup partition array */
1521 	dk_ioc.dki_lba = vtoc->efi_last_u_lba + 1;
1522 	dk_ioc.dki_length -= vtoc->efi_lbasize;
1523 	/* LINTED */
1524 	dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data +
1525 	    vtoc->efi_lbasize);
1526 
1527 	if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) {
1528 		/*
1529 		 * we wrote the primary label okay, so don't fail
1530 		 */
1531 		if (efi_debug) {
1532 			(void) fprintf(stderr,
1533 			    "write of backup partitions to block %llu "
1534 			    "failed, errno %d\n",
1535 			    vtoc->efi_last_u_lba + 1,
1536 			    errno);
1537 		}
1538 	}
1539 	/*
1540 	 * now swap MyLBA and AlternateLBA fields and write backup
1541 	 * partition table header
1542 	 */
1543 	dk_ioc.dki_lba = lba_backup_gpt_hdr;
1544 	dk_ioc.dki_length = vtoc->efi_lbasize;
1545 	/* LINTED */
1546 	dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data -
1547 	    vtoc->efi_lbasize);
1548 	efi->efi_gpt_AlternateLBA = LE_64(1ULL);
1549 	efi->efi_gpt_MyLBA = LE_64(lba_backup_gpt_hdr);
1550 	efi->efi_gpt_PartitionEntryLBA = LE_64(vtoc->efi_last_u_lba + 1);
1551 	efi->efi_gpt_HeaderCRC32 = 0;
1552 	efi->efi_gpt_HeaderCRC32 =
1553 	    LE_32(efi_crc32((unsigned char *)dk_ioc.dki_data,
1554 	    LE_32(efi->efi_gpt_HeaderSize)));
1555 
1556 	if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) {
1557 		if (efi_debug) {
1558 			(void) fprintf(stderr,
1559 			    "write of backup header to block %llu failed, "
1560 			    "errno %d\n",
1561 			    lba_backup_gpt_hdr,
1562 			    errno);
1563 		}
1564 	}
1565 	/* write the PMBR */
1566 	(void) write_pmbr(fd, vtoc);
1567 	free(dk_ioc.dki_data);
1568 
1569 	return (0);
1570 }
1571 
/*
 * Release a label structure by handing it to free().
 * A NULL ptr is accepted, since free(NULL) does nothing.
 */
void
efi_free(struct dk_gpt *ptr)
{
	free((void *)ptr);
}
1577 
/*
 * Input: File descriptor
 * Output: always ENOSYS; the file descriptor is not examined.
 *
 * The implementation this was derived from returned 1 if the disk has an
 * EFI label, or is > 2TB with no VTOC or legacy MBR, and 0 otherwise.  It
 * did so by issuing the DKIOCGEXTVTOC ioctl (and DKIOCGVTOC when that
 * failed with ENOTTY) and treating ENOTSUP as "EFI".  That probe was
 * already compiled out here, so it has been dropped.
 *
 * ENOSYS is non-zero, so a caller that treats the result as a boolean
 * sees "true"; compare against 1 explicitly if a real answer is needed.
 */
int
efi_type(int fd)
{
	(void) fd;

	return (ENOSYS);
}
1604 
1605 void
1606 efi_err_check(struct dk_gpt *vtoc)
1607 {
1608 	int			resv_part = -1;
1609 	int			i, j;
1610 	diskaddr_t		istart, jstart, isize, jsize, endsect;
1611 	int			overlap = 0;
1612 
1613 	/*
1614 	 * make sure no partitions overlap
1615 	 */
1616 	for (i = 0; i < vtoc->efi_nparts; i++) {
1617 		/* It can't be unassigned and have an actual size */
1618 		if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) &&
1619 		    (vtoc->efi_parts[i].p_size != 0)) {
1620 			(void) fprintf(stderr,
1621 			    "partition %d is \"unassigned\" but has a size "
1622 			    "of %llu\n", i, vtoc->efi_parts[i].p_size);
1623 		}
1624 		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) {
1625 			continue;
1626 		}
1627 		if (vtoc->efi_parts[i].p_tag == V_RESERVED) {
1628 			if (resv_part != -1) {
1629 				(void) fprintf(stderr,
1630 				    "found duplicate reserved partition at "
1631 				    "%d\n", i);
1632 			}
1633 			resv_part = i;
1634 			if (vtoc->efi_parts[i].p_size != EFI_MIN_RESV_SIZE)
1635 				(void) fprintf(stderr,
1636 				    "Warning: reserved partition size must "
1637 				    "be %d sectors\n", EFI_MIN_RESV_SIZE);
1638 		}
1639 		if ((vtoc->efi_parts[i].p_start < vtoc->efi_first_u_lba) ||
1640 		    (vtoc->efi_parts[i].p_start > vtoc->efi_last_u_lba)) {
1641 			(void) fprintf(stderr,
1642 			    "Partition %d starts at %llu\n",
1643 			    i,
1644 			    vtoc->efi_parts[i].p_start);
1645 			(void) fprintf(stderr,
1646 			    "It must be between %llu and %llu.\n",
1647 			    vtoc->efi_first_u_lba,
1648 			    vtoc->efi_last_u_lba);
1649 		}
1650 		if ((vtoc->efi_parts[i].p_start +
1651 		    vtoc->efi_parts[i].p_size <
1652 		    vtoc->efi_first_u_lba) ||
1653 		    (vtoc->efi_parts[i].p_start +
1654 		    vtoc->efi_parts[i].p_size >
1655 		    vtoc->efi_last_u_lba + 1)) {
1656 			(void) fprintf(stderr,
1657 			    "Partition %d ends at %llu\n",
1658 			    i,
1659 			    vtoc->efi_parts[i].p_start +
1660 			    vtoc->efi_parts[i].p_size);
1661 			(void) fprintf(stderr,
1662 			    "It must be between %llu and %llu.\n",
1663 			    vtoc->efi_first_u_lba,
1664 			    vtoc->efi_last_u_lba);
1665 		}
1666 
1667 		for (j = 0; j < vtoc->efi_nparts; j++) {
1668 			isize = vtoc->efi_parts[i].p_size;
1669 			jsize = vtoc->efi_parts[j].p_size;
1670 			istart = vtoc->efi_parts[i].p_start;
1671 			jstart = vtoc->efi_parts[j].p_start;
1672 			if ((i != j) && (isize != 0) && (jsize != 0)) {
1673 				endsect = jstart + jsize -1;
1674 				if ((jstart <= istart) &&
1675 				    (istart <= endsect)) {
1676 					if (!overlap) {
1677 					(void) fprintf(stderr,
1678 					    "label error: EFI Labels do not "
1679 					    "support overlapping partitions\n");
1680 					}
1681 					(void) fprintf(stderr,
1682 					    "Partition %d overlaps partition "
1683 					    "%d.\n", i, j);
1684 					overlap = 1;
1685 				}
1686 			}
1687 		}
1688 	}
1689 	/* make sure there is a reserved partition */
1690 	if (resv_part == -1) {
1691 		(void) fprintf(stderr,
1692 		    "no reserved partition found\n");
1693 	}
1694 }
1695 
1696 /*
1697  * We need to get information necessary to construct a *new* efi
1698  * label type
1699  */
1700 int
1701 efi_auto_sense(int fd, struct dk_gpt **vtoc)
1702 {
1703 
1704 	int	i;
1705 
1706 	/*
1707 	 * Now build the default partition table
1708 	 */
1709 	if (efi_alloc_and_init(fd, EFI_NUMPAR, vtoc) != 0) {
1710 		if (efi_debug) {
1711 			(void) fprintf(stderr, "efi_alloc_and_init failed.\n");
1712 		}
1713 		return (-1);
1714 	}
1715 
1716 	for (i = 0; i < MIN((*vtoc)->efi_nparts, V_NUMPAR); i++) {
1717 		(*vtoc)->efi_parts[i].p_tag = default_vtoc_map[i].p_tag;
1718 		(*vtoc)->efi_parts[i].p_flag = default_vtoc_map[i].p_flag;
1719 		(*vtoc)->efi_parts[i].p_start = 0;
1720 		(*vtoc)->efi_parts[i].p_size = 0;
1721 	}
1722 	/*
1723 	 * Make constants first
1724 	 * and variable partitions later
1725 	 */
1726 
1727 	/* root partition - s0 128 MB */
1728 	(*vtoc)->efi_parts[0].p_start = 34;
1729 	(*vtoc)->efi_parts[0].p_size = 262144;
1730 
1731 	/* partition - s1  128 MB */
1732 	(*vtoc)->efi_parts[1].p_start = 262178;
1733 	(*vtoc)->efi_parts[1].p_size = 262144;
1734 
1735 	/* partition -s2 is NOT the Backup disk */
1736 	(*vtoc)->efi_parts[2].p_tag = V_UNASSIGNED;
1737 
1738 	/* partition -s6 /usr partition - HOG */
1739 	(*vtoc)->efi_parts[6].p_start = 524322;
1740 	(*vtoc)->efi_parts[6].p_size = (*vtoc)->efi_last_u_lba - 524322
1741 	    - (1024 * 16);
1742 
1743 	/* efi reserved partition - s9 16K */
1744 	(*vtoc)->efi_parts[8].p_start = (*vtoc)->efi_last_u_lba - (1024 * 16);
1745 	(*vtoc)->efi_parts[8].p_size = (1024 * 16);
1746 	(*vtoc)->efi_parts[8].p_tag = V_RESERVED;
1747 	return (0);
1748 }
1749