xref: /freebsd/stand/libsa/zfs/zfs.c (revision e17f5b1d)
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  *	$FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  *	Stand-alone file reading package.
34  */
35 
36 #include <stand.h>
37 #include <sys/disk.h>
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/queue.h>
41 #include <disk.h>
42 #include <part.h>
43 #include <stddef.h>
44 #include <stdarg.h>
45 #include <string.h>
46 #include <bootstrap.h>
47 
48 #include "libzfs.h"
49 
50 #include "zfsimpl.c"
51 
/*
 * First and last index (both inclusive) of the slots that are populated
 * with ZFS Boot Environments.
 */
#define	ZFS_BE_FIRST	4
#define	ZFS_BE_LAST	8
55 
56 static int	zfs_open(const char *path, struct open_file *f);
57 static int	zfs_close(struct open_file *f);
58 static int	zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
59 static off_t	zfs_seek(struct open_file *f, off_t offset, int where);
60 static int	zfs_stat(struct open_file *f, struct stat *sb);
61 static int	zfs_readdir(struct open_file *f, struct dirent *d);
62 
63 static void	zfs_bootenv_initial(const char *);
64 
65 struct devsw zfs_dev;
66 
67 struct fs_ops zfs_fsops = {
68 	"zfs",
69 	zfs_open,
70 	zfs_close,
71 	zfs_read,
72 	null_write,
73 	zfs_seek,
74 	zfs_stat,
75 	zfs_readdir
76 };
77 
78 /*
79  * In-core open file.
80  */
81 struct file {
82 	off_t		f_seekp;	/* seek pointer */
83 	dnode_phys_t	f_dnode;
84 	uint64_t	f_zap_type;	/* zap type for readdir */
85 	uint64_t	f_num_leafs;	/* number of fzap leaf blocks */
86 	zap_leaf_phys_t	*f_zap_leaf;	/* zap leaf buffer */
87 };
88 
89 static int	zfs_env_index;
90 static int	zfs_env_count;
91 
92 SLIST_HEAD(zfs_be_list, zfs_be_entry) zfs_be_head = SLIST_HEAD_INITIALIZER(zfs_be_head);
93 struct zfs_be_list *zfs_be_headp;
94 struct zfs_be_entry {
95 	char *name;
96 	SLIST_ENTRY(zfs_be_entry) entries;
97 } *zfs_be, *zfs_be_tmp;
98 
99 /*
100  * Open a file.
101  */
102 static int
103 zfs_open(const char *upath, struct open_file *f)
104 {
105 	struct zfsmount *mount = (struct zfsmount *)f->f_devdata;
106 	struct file *fp;
107 	int rc;
108 
109 	if (f->f_dev != &zfs_dev)
110 		return (EINVAL);
111 
112 	/* allocate file system specific data structure */
113 	fp = calloc(1, sizeof(struct file));
114 	if (fp == NULL)
115 		return (ENOMEM);
116 	f->f_fsdata = fp;
117 
118 	rc = zfs_lookup(mount, upath, &fp->f_dnode);
119 	fp->f_seekp = 0;
120 	if (rc) {
121 		f->f_fsdata = NULL;
122 		free(fp);
123 	}
124 	return (rc);
125 }
126 
127 static int
128 zfs_close(struct open_file *f)
129 {
130 	struct file *fp = (struct file *)f->f_fsdata;
131 
132 	dnode_cache_obj = NULL;
133 	f->f_fsdata = NULL;
134 
135 	free(fp);
136 	return (0);
137 }
138 
139 /*
140  * Copy a portion of a file into kernel memory.
141  * Cross block boundaries when necessary.
142  */
143 static int
144 zfs_read(struct open_file *f, void *start, size_t size, size_t *resid	/* out */)
145 {
146 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
147 	struct file *fp = (struct file *)f->f_fsdata;
148 	struct stat sb;
149 	size_t n;
150 	int rc;
151 
152 	rc = zfs_stat(f, &sb);
153 	if (rc)
154 		return (rc);
155 	n = size;
156 	if (fp->f_seekp + n > sb.st_size)
157 		n = sb.st_size - fp->f_seekp;
158 
159 	rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
160 	if (rc)
161 		return (rc);
162 
163 	if (0) {
164 	    int i;
165 	    for (i = 0; i < n; i++)
166 		putchar(((char*) start)[i]);
167 	}
168 	fp->f_seekp += n;
169 	if (resid)
170 		*resid = size - n;
171 
172 	return (0);
173 }
174 
175 static off_t
176 zfs_seek(struct open_file *f, off_t offset, int where)
177 {
178 	struct file *fp = (struct file *)f->f_fsdata;
179 
180 	switch (where) {
181 	case SEEK_SET:
182 		fp->f_seekp = offset;
183 		break;
184 	case SEEK_CUR:
185 		fp->f_seekp += offset;
186 		break;
187 	case SEEK_END:
188 	    {
189 		struct stat sb;
190 		int error;
191 
192 		error = zfs_stat(f, &sb);
193 		if (error != 0) {
194 			errno = error;
195 			return (-1);
196 		}
197 		fp->f_seekp = sb.st_size - offset;
198 		break;
199 	    }
200 	default:
201 		errno = EINVAL;
202 		return (-1);
203 	}
204 	return (fp->f_seekp);
205 }
206 
207 static int
208 zfs_stat(struct open_file *f, struct stat *sb)
209 {
210 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
211 	struct file *fp = (struct file *)f->f_fsdata;
212 
213 	return (zfs_dnode_stat(spa, &fp->f_dnode, sb));
214 }
215 
216 static int
217 zfs_readdir(struct open_file *f, struct dirent *d)
218 {
219 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
220 	struct file *fp = (struct file *)f->f_fsdata;
221 	mzap_ent_phys_t mze;
222 	struct stat sb;
223 	size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
224 	int rc;
225 
226 	rc = zfs_stat(f, &sb);
227 	if (rc)
228 		return (rc);
229 	if (!S_ISDIR(sb.st_mode))
230 		return (ENOTDIR);
231 
232 	/*
233 	 * If this is the first read, get the zap type.
234 	 */
235 	if (fp->f_seekp == 0) {
236 		rc = dnode_read(spa, &fp->f_dnode,
237 				0, &fp->f_zap_type, sizeof(fp->f_zap_type));
238 		if (rc)
239 			return (rc);
240 
241 		if (fp->f_zap_type == ZBT_MICRO) {
242 			fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
243 		} else {
244 			rc = dnode_read(spa, &fp->f_dnode,
245 					offsetof(zap_phys_t, zap_num_leafs),
246 					&fp->f_num_leafs,
247 					sizeof(fp->f_num_leafs));
248 			if (rc)
249 				return (rc);
250 
251 			fp->f_seekp = bsize;
252 			fp->f_zap_leaf = malloc(bsize);
253 			if (fp->f_zap_leaf == NULL)
254 				return (ENOMEM);
255 			rc = dnode_read(spa, &fp->f_dnode,
256 					fp->f_seekp,
257 					fp->f_zap_leaf,
258 					bsize);
259 			if (rc)
260 				return (rc);
261 		}
262 	}
263 
264 	if (fp->f_zap_type == ZBT_MICRO) {
265 	mzap_next:
266 		if (fp->f_seekp >= bsize)
267 			return (ENOENT);
268 
269 		rc = dnode_read(spa, &fp->f_dnode,
270 				fp->f_seekp, &mze, sizeof(mze));
271 		if (rc)
272 			return (rc);
273 		fp->f_seekp += sizeof(mze);
274 
275 		if (!mze.mze_name[0])
276 			goto mzap_next;
277 
278 		d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
279 		d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
280 		strcpy(d->d_name, mze.mze_name);
281 		d->d_namlen = strlen(d->d_name);
282 		return (0);
283 	} else {
284 		zap_leaf_t zl;
285 		zap_leaf_chunk_t *zc, *nc;
286 		int chunk;
287 		size_t namelen;
288 		char *p;
289 		uint64_t value;
290 
291 		/*
292 		 * Initialise this so we can use the ZAP size
293 		 * calculating macros.
294 		 */
295 		zl.l_bs = ilog2(bsize);
296 		zl.l_phys = fp->f_zap_leaf;
297 
298 		/*
299 		 * Figure out which chunk we are currently looking at
300 		 * and consider seeking to the next leaf. We use the
301 		 * low bits of f_seekp as a simple chunk index.
302 		 */
303 	fzap_next:
304 		chunk = fp->f_seekp & (bsize - 1);
305 		if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
306 			fp->f_seekp = rounddown2(fp->f_seekp, bsize) + bsize;
307 			chunk = 0;
308 
309 			/*
310 			 * Check for EOF and read the new leaf.
311 			 */
312 			if (fp->f_seekp >= bsize * fp->f_num_leafs)
313 				return (ENOENT);
314 
315 			rc = dnode_read(spa, &fp->f_dnode,
316 					fp->f_seekp,
317 					fp->f_zap_leaf,
318 					bsize);
319 			if (rc)
320 				return (rc);
321 		}
322 
323 		zc = &ZAP_LEAF_CHUNK(&zl, chunk);
324 		fp->f_seekp++;
325 		if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
326 			goto fzap_next;
327 
328 		namelen = zc->l_entry.le_name_numints;
329 		if (namelen > sizeof(d->d_name))
330 			namelen = sizeof(d->d_name);
331 
332 		/*
333 		 * Paste the name back together.
334 		 */
335 		nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
336 		p = d->d_name;
337 		while (namelen > 0) {
338 			int len;
339 			len = namelen;
340 			if (len > ZAP_LEAF_ARRAY_BYTES)
341 				len = ZAP_LEAF_ARRAY_BYTES;
342 			memcpy(p, nc->l_array.la_array, len);
343 			p += len;
344 			namelen -= len;
345 			nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
346 		}
347 		d->d_name[sizeof(d->d_name) - 1] = 0;
348 
349 		/*
350 		 * Assume the first eight bytes of the value are
351 		 * a uint64_t.
352 		 */
353 		value = fzap_leaf_value(&zl, zc);
354 
355 		d->d_fileno = ZFS_DIRENT_OBJ(value);
356 		d->d_type = ZFS_DIRENT_TYPE(value);
357 		d->d_namlen = strlen(d->d_name);
358 
359 		return (0);
360 	}
361 }
362 
363 static int
364 vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t bytes)
365 {
366 	int fd, ret;
367 	size_t res, head, tail, total_size, full_sec_size;
368 	unsigned secsz, do_tail_read;
369 	off_t start_sec;
370 	char *outbuf, *bouncebuf;
371 
372 	fd = (uintptr_t) priv;
373 	outbuf = (char *) buf;
374 	bouncebuf = NULL;
375 
376 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
377 	if (ret != 0)
378 		return (ret);
379 
380 	/*
381 	 * Handling reads of arbitrary offset and size - multi-sector case
382 	 * and single-sector case.
383 	 *
384 	 *                        Multi-sector Case
385 	 *                (do_tail_read = true if tail > 0)
386 	 *
387 	 *   |<----------------------total_size--------------------->|
388 	 *   |                                                       |
389 	 *   |<--head-->|<--------------bytes------------>|<--tail-->|
390 	 *   |          |                                 |          |
391 	 *   |          |       |<~full_sec_size~>|       |          |
392 	 *   +------------------+                 +------------------+
393 	 *   |          |0101010|     .  .  .     |0101011|          |
394 	 *   +------------------+                 +------------------+
395 	 *         start_sec                         start_sec + n
396 	 *
397 	 *
398 	 *                      Single-sector Case
399 	 *                    (do_tail_read = false)
400 	 *
401 	 *              |<------total_size = secsz----->|
402 	 *              |                               |
403 	 *              |<-head->|<---bytes--->|<-tail->|
404 	 *              +-------------------------------+
405 	 *              |        |0101010101010|        |
406 	 *              +-------------------------------+
407 	 *                          start_sec
408 	 */
409 	start_sec = offset / secsz;
410 	head = offset % secsz;
411 	total_size = roundup2(head + bytes, secsz);
412 	tail = total_size - (head + bytes);
413 	do_tail_read = ((tail > 0) && (head + bytes > secsz));
414 	full_sec_size = total_size;
415 	if (head > 0)
416 		full_sec_size -= secsz;
417 	if (do_tail_read)
418 		full_sec_size -= secsz;
419 
420 	/* Return of partial sector data requires a bounce buffer. */
421 	if ((head > 0) || do_tail_read || bytes < secsz) {
422 		bouncebuf = malloc(secsz);
423 		if (bouncebuf == NULL) {
424 			printf("vdev_read: out of memory\n");
425 			return (ENOMEM);
426 		}
427 	}
428 
429 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
430 		ret = errno;
431 		goto error;
432 	}
433 
434 	/* Partial data return from first sector */
435 	if (head > 0) {
436 		res = read(fd, bouncebuf, secsz);
437 		if (res != secsz) {
438 			ret = EIO;
439 			goto error;
440 		}
441 		memcpy(outbuf, bouncebuf + head, min(secsz - head, bytes));
442 		outbuf += min(secsz - head, bytes);
443 	}
444 
445 	/*
446 	 * Full data return from read sectors.
447 	 * Note, there is still corner case where we read
448 	 * from sector boundary, but less than sector size, e.g. reading 512B
449 	 * from 4k sector.
450 	 */
451 	if (full_sec_size > 0) {
452 		if (bytes < full_sec_size) {
453 			res = read(fd, bouncebuf, secsz);
454 			if (res != secsz) {
455 				ret = EIO;
456 				goto error;
457 			}
458 			memcpy(outbuf, bouncebuf, bytes);
459 		} else {
460 			res = read(fd, outbuf, full_sec_size);
461 			if (res != full_sec_size) {
462 				ret = EIO;
463 				goto error;
464 			}
465 			outbuf += full_sec_size;
466 		}
467 	}
468 
469 	/* Partial data return from last sector */
470 	if (do_tail_read) {
471 		res = read(fd, bouncebuf, secsz);
472 		if (res != secsz) {
473 			ret = EIO;
474 			goto error;
475 		}
476 		memcpy(outbuf, bouncebuf, secsz - tail);
477 	}
478 
479 	ret = 0;
480 error:
481 	free(bouncebuf);
482 	return (ret);
483 }
484 
485 static int
486 vdev_write(vdev_t *vdev __unused, void *priv, off_t offset, void *buf,
487     size_t bytes)
488 {
489 	int fd, ret;
490 	size_t head, tail, total_size, full_sec_size;
491 	unsigned secsz, do_tail_write;
492 	off_t start_sec;
493 	ssize_t res;
494 	char *outbuf, *bouncebuf;
495 
496 	fd = (uintptr_t)priv;
497 	outbuf = (char *) buf;
498 	bouncebuf = NULL;
499 
500 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
501 	if (ret != 0)
502 		return (ret);
503 
504 	start_sec = offset / secsz;
505 	head = offset % secsz;
506 	total_size = roundup2(head + bytes, secsz);
507 	tail = total_size - (head + bytes);
508 	do_tail_write = ((tail > 0) && (head + bytes > secsz));
509 	full_sec_size = total_size;
510 	if (head > 0)
511 		full_sec_size -= secsz;
512 	if (do_tail_write)
513 		full_sec_size -= secsz;
514 
515 	/* Partial sector write requires a bounce buffer. */
516 	if ((head > 0) || do_tail_write || bytes < secsz) {
517 		bouncebuf = malloc(secsz);
518 		if (bouncebuf == NULL) {
519 			printf("vdev_write: out of memory\n");
520 			return (ENOMEM);
521 		}
522 	}
523 
524 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
525 		ret = errno;
526 		goto error;
527 	}
528 
529 	/* Partial data for first sector */
530 	if (head > 0) {
531 		res = read(fd, bouncebuf, secsz);
532 		if (res != secsz) {
533 			ret = EIO;
534 			goto error;
535 		}
536 		memcpy(bouncebuf + head, outbuf, min(secsz - head, bytes));
537 		(void) lseek(fd, -secsz, SEEK_CUR);
538 		res = write(fd, bouncebuf, secsz);
539 		if (res != secsz) {
540 			ret = EIO;
541 			goto error;
542 		}
543 		outbuf += min(secsz - head, bytes);
544 	}
545 
546 	/*
547 	 * Full data write to sectors.
548 	 * Note, there is still corner case where we write
549 	 * to sector boundary, but less than sector size, e.g. write 512B
550 	 * to 4k sector.
551 	 */
552 	if (full_sec_size > 0) {
553 		if (bytes < full_sec_size) {
554 			res = read(fd, bouncebuf, secsz);
555 			if (res != secsz) {
556 				ret = EIO;
557 				goto error;
558 			}
559 			memcpy(bouncebuf, outbuf, bytes);
560 			(void) lseek(fd, -secsz, SEEK_CUR);
561 			res = write(fd, bouncebuf, secsz);
562 			if (res != secsz) {
563 				ret = EIO;
564 				goto error;
565 			}
566 		} else {
567 			res = write(fd, outbuf, full_sec_size);
568 			if (res != full_sec_size) {
569 				ret = EIO;
570 				goto error;
571 			}
572 			outbuf += full_sec_size;
573 		}
574 	}
575 
576 	/* Partial data write to last sector */
577 	if (do_tail_write) {
578 		res = read(fd, bouncebuf, secsz);
579 		if (res != secsz) {
580 			ret = EIO;
581 			goto error;
582 		}
583 		memcpy(bouncebuf, outbuf, secsz - tail);
584 		(void) lseek(fd, -secsz, SEEK_CUR);
585 		res = write(fd, bouncebuf, secsz);
586 		if (res != secsz) {
587 			ret = EIO;
588 			goto error;
589 		}
590 	}
591 
592 	ret = 0;
593 error:
594 	free(bouncebuf);
595 	return (ret);
596 }
597 
598 static void
599 vdev_clear_pad2(vdev_t *vdev)
600 {
601 	vdev_t *kid;
602 	vdev_boot_envblock_t *be;
603 	off_t off = offsetof(vdev_label_t, vl_be);
604 	zio_checksum_info_t *ci;
605 	zio_cksum_t cksum;
606 
607 	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
608 		if (kid->v_state != VDEV_STATE_HEALTHY)
609 			continue;
610 		vdev_clear_pad2(kid);
611 	}
612 
613 	if (!STAILQ_EMPTY(&vdev->v_children))
614 		return;
615 
616 	be = calloc(1, sizeof (*be));
617 	if (be == NULL) {
618 		printf("failed to clear be area: out of memory\n");
619 		return;
620 	}
621 
622 	ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
623 	be->vbe_zbt.zec_magic = ZEC_MAGIC;
624 	zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off);
625 	ci->ci_func[0](be, sizeof (*be), NULL, &cksum);
626 	be->vbe_zbt.zec_cksum = cksum;
627 
628 	if (vdev_write(vdev, vdev->v_read_priv, off, be, VDEV_PAD_SIZE)) {
629 		printf("failed to clear be area of primary vdev: %d\n",
630 		    errno);
631 	}
632 	free(be);
633 }
634 
635 /*
636  * Read the next boot command from pad2.
637  * If any instance of pad2 is set to empty string, or the returned string
638  * values are not the same, we consider next boot not to be set.
639  */
640 static char *
641 vdev_read_pad2(vdev_t *vdev)
642 {
643 	vdev_t *kid;
644 	char *tmp, *result = NULL;
645 	vdev_boot_envblock_t *be;
646 	off_t off = offsetof(vdev_label_t, vl_be);
647 
648 	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
649 		if (kid->v_state != VDEV_STATE_HEALTHY)
650 			continue;
651 		tmp = vdev_read_pad2(kid);
652 		if (tmp == NULL)
653 			continue;
654 
655 		/* The next boot is not set, we are done. */
656 		if (*tmp == '\0') {
657 			free(result);
658 			return (tmp);
659 		}
660 		if (result == NULL) {
661 			result = tmp;
662 			continue;
663 		}
664 		/* Are the next boot strings different? */
665 		if (strcmp(result, tmp) != 0) {
666 			free(tmp);
667 			*result = '\0';
668 			break;
669 		}
670 		free(tmp);
671 	}
672 	if (result != NULL)
673 		return (result);
674 
675 	be = malloc(sizeof (*be));
676 	if (be == NULL)
677 		return (NULL);
678 
679 	if (vdev_read(vdev, vdev->v_read_priv, off, be, sizeof (*be))) {
680 		return (NULL);
681 	}
682 
683 	switch (be->vbe_version) {
684 	case VB_RAW:
685 	case VB_NVLIST:
686 		result = strdup(be->vbe_bootenv);
687 	default:
688 		/* Backward compatibility with initial nextboot feaure. */
689 		result = strdup((char *)be);
690 	}
691 	return (result);
692 }
693 
694 static int
695 zfs_dev_init(void)
696 {
697 	spa_t *spa;
698 	spa_t *next;
699 	spa_t *prev;
700 
701 	zfs_init();
702 	if (archsw.arch_zfs_probe == NULL)
703 		return (ENXIO);
704 	archsw.arch_zfs_probe();
705 
706 	prev = NULL;
707 	spa = STAILQ_FIRST(&zfs_pools);
708 	while (spa != NULL) {
709 		next = STAILQ_NEXT(spa, spa_link);
710 		if (zfs_spa_init(spa)) {
711 			if (prev == NULL)
712 				STAILQ_REMOVE_HEAD(&zfs_pools, spa_link);
713 			else
714 				STAILQ_REMOVE_AFTER(&zfs_pools, prev, spa_link);
715 		} else
716 			prev = spa;
717 		spa = next;
718 	}
719 	return (0);
720 }
721 
722 struct zfs_probe_args {
723 	int		fd;
724 	const char	*devname;
725 	uint64_t	*pool_guid;
726 	u_int		secsz;
727 };
728 
729 static int
730 zfs_diskread(void *arg, void *buf, size_t blocks, uint64_t offset)
731 {
732 	struct zfs_probe_args *ppa;
733 
734 	ppa = (struct zfs_probe_args *)arg;
735 	return (vdev_read(NULL, (void *)(uintptr_t)ppa->fd,
736 	    offset * ppa->secsz, buf, blocks * ppa->secsz));
737 }
738 
739 static int
740 zfs_probe(int fd, uint64_t *pool_guid)
741 {
742 	spa_t *spa;
743 	int ret;
744 
745 	spa = NULL;
746 	ret = vdev_probe(vdev_read, (void *)(uintptr_t)fd, &spa);
747 	if (ret == 0 && pool_guid != NULL)
748 		*pool_guid = spa->spa_guid;
749 	return (ret);
750 }
751 
752 static int
753 zfs_probe_partition(void *arg, const char *partname,
754     const struct ptable_entry *part)
755 {
756 	struct zfs_probe_args *ppa, pa;
757 	struct ptable *table;
758 	char devname[32];
759 	int ret;
760 
761 	/* Probe only freebsd-zfs and freebsd partitions */
762 	if (part->type != PART_FREEBSD &&
763 	    part->type != PART_FREEBSD_ZFS)
764 		return (0);
765 
766 	ppa = (struct zfs_probe_args *)arg;
767 	strncpy(devname, ppa->devname, strlen(ppa->devname) - 1);
768 	devname[strlen(ppa->devname) - 1] = '\0';
769 	sprintf(devname, "%s%s:", devname, partname);
770 	pa.fd = open(devname, O_RDWR);
771 	if (pa.fd == -1)
772 		return (0);
773 	ret = zfs_probe(pa.fd, ppa->pool_guid);
774 	if (ret == 0)
775 		return (0);
776 	/* Do we have BSD label here? */
777 	if (part->type == PART_FREEBSD) {
778 		pa.devname = devname;
779 		pa.pool_guid = ppa->pool_guid;
780 		pa.secsz = ppa->secsz;
781 		table = ptable_open(&pa, part->end - part->start + 1,
782 		    ppa->secsz, zfs_diskread);
783 		if (table != NULL) {
784 			ptable_iterate(table, &pa, zfs_probe_partition);
785 			ptable_close(table);
786 		}
787 	}
788 	close(pa.fd);
789 	return (0);
790 }
791 
792 int
793 zfs_nextboot(void *vdev, char *buf, size_t size)
794 {
795 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
796 	spa_t *spa;
797 	vdev_t *vd;
798 	char *result = NULL;
799 
800 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
801 		return (1);
802 
803 	if (dev->pool_guid == 0)
804 		spa = STAILQ_FIRST(&zfs_pools);
805 	else
806 		spa = spa_find_by_guid(dev->pool_guid);
807 
808 	if (spa == NULL) {
809 		printf("ZFS: can't find pool by guid\n");
810 	return (1);
811 	}
812 
813 	STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
814 		char *tmp = vdev_read_pad2(vd);
815 
816 		/* Continue on error. */
817 		if (tmp == NULL)
818 			continue;
819 		/* Nextboot is not set. */
820 		if (*tmp == '\0') {
821 			free(result);
822 			free(tmp);
823 			return (1);
824 		}
825 		if (result == NULL) {
826 			result = tmp;
827 			continue;
828 		}
829 		free(tmp);
830 	}
831 	if (result == NULL)
832 		return (1);
833 
834 	STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
835 		vdev_clear_pad2(vd);
836 	}
837 
838 	strlcpy(buf, result, size);
839 	free(result);
840 	return (0);
841 }
842 
843 int
844 zfs_probe_dev(const char *devname, uint64_t *pool_guid)
845 {
846 	struct disk_devdesc *dev;
847 	struct ptable *table;
848 	struct zfs_probe_args pa;
849 	uint64_t mediasz;
850 	int ret;
851 
852 	if (pool_guid)
853 		*pool_guid = 0;
854 	pa.fd = open(devname, O_RDWR);
855 	if (pa.fd == -1)
856 		return (ENXIO);
857 	/*
858 	 * We will not probe the whole disk, we can not boot from such
859 	 * disks and some systems will misreport the disk sizes and will
860 	 * hang while accessing the disk.
861 	 */
862 	if (archsw.arch_getdev((void **)&dev, devname, NULL) == 0) {
863 		int partition = dev->d_partition;
864 		int slice = dev->d_slice;
865 
866 		free(dev);
867 		if (partition != D_PARTNONE && slice != D_SLICENONE) {
868 			ret = zfs_probe(pa.fd, pool_guid);
869 			if (ret == 0)
870 				return (0);
871 		}
872 	}
873 
874 	/* Probe each partition */
875 	ret = ioctl(pa.fd, DIOCGMEDIASIZE, &mediasz);
876 	if (ret == 0)
877 		ret = ioctl(pa.fd, DIOCGSECTORSIZE, &pa.secsz);
878 	if (ret == 0) {
879 		pa.devname = devname;
880 		pa.pool_guid = pool_guid;
881 		table = ptable_open(&pa, mediasz / pa.secsz, pa.secsz,
882 		    zfs_diskread);
883 		if (table != NULL) {
884 			ptable_iterate(table, &pa, zfs_probe_partition);
885 			ptable_close(table);
886 		}
887 	}
888 	close(pa.fd);
889 	if (pool_guid && *pool_guid == 0)
890 		ret = ENXIO;
891 	return (ret);
892 }
893 
894 /*
895  * Print information about ZFS pools
896  */
897 static int
898 zfs_dev_print(int verbose)
899 {
900 	spa_t *spa;
901 	char line[80];
902 	int ret = 0;
903 
904 	if (STAILQ_EMPTY(&zfs_pools))
905 		return (0);
906 
907 	printf("%s devices:", zfs_dev.dv_name);
908 	if ((ret = pager_output("\n")) != 0)
909 		return (ret);
910 
911 	if (verbose) {
912 		return (spa_all_status());
913 	}
914 	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
915 		snprintf(line, sizeof(line), "    zfs:%s\n", spa->spa_name);
916 		ret = pager_output(line);
917 		if (ret != 0)
918 			break;
919 	}
920 	return (ret);
921 }
922 
923 /*
924  * Attempt to open the pool described by (dev) for use by (f).
925  */
926 static int
927 zfs_dev_open(struct open_file *f, ...)
928 {
929 	va_list		args;
930 	struct zfs_devdesc	*dev;
931 	struct zfsmount	*mount;
932 	spa_t		*spa;
933 	int		rv;
934 
935 	va_start(args, f);
936 	dev = va_arg(args, struct zfs_devdesc *);
937 	va_end(args);
938 
939 	if (dev->pool_guid == 0)
940 		spa = STAILQ_FIRST(&zfs_pools);
941 	else
942 		spa = spa_find_by_guid(dev->pool_guid);
943 	if (!spa)
944 		return (ENXIO);
945 	mount = malloc(sizeof(*mount));
946 	if (mount == NULL)
947 		rv = ENOMEM;
948 	else
949 		rv = zfs_mount(spa, dev->root_guid, mount);
950 	if (rv != 0) {
951 		free(mount);
952 		return (rv);
953 	}
954 	if (mount->objset.os_type != DMU_OST_ZFS) {
955 		printf("Unexpected object set type %ju\n",
956 		    (uintmax_t)mount->objset.os_type);
957 		free(mount);
958 		return (EIO);
959 	}
960 	f->f_devdata = mount;
961 	free(dev);
962 	return (0);
963 }
964 
965 static int
966 zfs_dev_close(struct open_file *f)
967 {
968 
969 	free(f->f_devdata);
970 	f->f_devdata = NULL;
971 	return (0);
972 }
973 
974 static int
975 zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
976 {
977 
978 	return (ENOSYS);
979 }
980 
981 struct devsw zfs_dev = {
982 	.dv_name = "zfs",
983 	.dv_type = DEVT_ZFS,
984 	.dv_init = zfs_dev_init,
985 	.dv_strategy = zfs_dev_strategy,
986 	.dv_open = zfs_dev_open,
987 	.dv_close = zfs_dev_close,
988 	.dv_ioctl = noioctl,
989 	.dv_print = zfs_dev_print,
990 	.dv_cleanup = NULL
991 };
992 
993 int
994 zfs_parsedev(struct zfs_devdesc *dev, const char *devspec, const char **path)
995 {
996 	static char	rootname[ZFS_MAXNAMELEN];
997 	static char	poolname[ZFS_MAXNAMELEN];
998 	spa_t		*spa;
999 	const char	*end;
1000 	const char	*np;
1001 	const char	*sep;
1002 	int		rv;
1003 
1004 	np = devspec;
1005 	if (*np != ':')
1006 		return (EINVAL);
1007 	np++;
1008 	end = strrchr(np, ':');
1009 	if (end == NULL)
1010 		return (EINVAL);
1011 	sep = strchr(np, '/');
1012 	if (sep == NULL || sep >= end)
1013 		sep = end;
1014 	memcpy(poolname, np, sep - np);
1015 	poolname[sep - np] = '\0';
1016 	if (sep < end) {
1017 		sep++;
1018 		memcpy(rootname, sep, end - sep);
1019 		rootname[end - sep] = '\0';
1020 	}
1021 	else
1022 		rootname[0] = '\0';
1023 
1024 	spa = spa_find_by_name(poolname);
1025 	if (!spa)
1026 		return (ENXIO);
1027 	dev->pool_guid = spa->spa_guid;
1028 	rv = zfs_lookup_dataset(spa, rootname, &dev->root_guid);
1029 	if (rv != 0)
1030 		return (rv);
1031 	if (path != NULL)
1032 		*path = (*end == '\0') ? end : end + 1;
1033 	dev->dd.d_dev = &zfs_dev;
1034 	return (0);
1035 }
1036 
1037 char *
1038 zfs_fmtdev(void *vdev)
1039 {
1040 	static char		rootname[ZFS_MAXNAMELEN];
1041 	static char		buf[2 * ZFS_MAXNAMELEN + 8];
1042 	struct zfs_devdesc	*dev = (struct zfs_devdesc *)vdev;
1043 	spa_t			*spa;
1044 
1045 	buf[0] = '\0';
1046 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1047 		return (buf);
1048 
1049 	/* Do we have any pools? */
1050 	spa = STAILQ_FIRST(&zfs_pools);
1051 	if (spa == NULL)
1052 		return (buf);
1053 
1054 	if (dev->pool_guid == 0)
1055 		dev->pool_guid = spa->spa_guid;
1056 	else
1057 		spa = spa_find_by_guid(dev->pool_guid);
1058 
1059 	if (spa == NULL) {
1060 		printf("ZFS: can't find pool by guid\n");
1061 		return (buf);
1062 	}
1063 	if (dev->root_guid == 0 && zfs_get_root(spa, &dev->root_guid)) {
1064 		printf("ZFS: can't find root filesystem\n");
1065 		return (buf);
1066 	}
1067 	if (zfs_rlookup(spa, dev->root_guid, rootname)) {
1068 		printf("ZFS: can't find filesystem by guid\n");
1069 		return (buf);
1070 	}
1071 
1072 	if (rootname[0] == '\0')
1073 		sprintf(buf, "%s:%s:", dev->dd.d_dev->dv_name, spa->spa_name);
1074 	else
1075 		sprintf(buf, "%s:%s/%s:", dev->dd.d_dev->dv_name, spa->spa_name,
1076 		    rootname);
1077 	return (buf);
1078 }
1079 
1080 int
1081 zfs_list(const char *name)
1082 {
1083 	static char	poolname[ZFS_MAXNAMELEN];
1084 	uint64_t	objid;
1085 	spa_t		*spa;
1086 	const char	*dsname;
1087 	int		len;
1088 	int		rv;
1089 
1090 	len = strlen(name);
1091 	dsname = strchr(name, '/');
1092 	if (dsname != NULL) {
1093 		len = dsname - name;
1094 		dsname++;
1095 	} else
1096 		dsname = "";
1097 	memcpy(poolname, name, len);
1098 	poolname[len] = '\0';
1099 
1100 	spa = spa_find_by_name(poolname);
1101 	if (!spa)
1102 		return (ENXIO);
1103 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1104 	if (rv != 0)
1105 		return (rv);
1106 
1107 	return (zfs_list_dataset(spa, objid));
1108 }
1109 
/*
 * Seed the boot environment variables from the current device string
 * currdev_in ("zfs:pool/.../be:").
 *
 * Nothing is done unless the string starts with "zfs:".  The last
 * character is dropped and the remainder stored in zfs_be_active;
 * zfs_be_currpage is set to "1".  After the last path element has been
 * cut off, the part following the first ':' becomes zfs_be_root and is
 * passed to zfs_bootenv_initial().
 */
void
init_zfs_bootenv(const char *currdev_in)
{
	char *beroot, *currdev;
	int currdev_len;

	currdev = NULL;
	currdev_len = strlen(currdev_in);
	if (currdev_len == 0)
		return;
	if (strncmp(currdev_in, "zfs:", 4) != 0)
		return;
	currdev = strdup(currdev_in);
	if (currdev == NULL)
		return;
	/* Remove the trailing : */
	currdev[currdev_len - 1] = '\0';
	setenv("zfs_be_active", currdev, 1);
	setenv("zfs_be_currpage", "1", 1);
	/* Remove the last element (current bootenv) */
	beroot = strrchr(currdev, '/');
	if (beroot != NULL)
		beroot[0] = '\0';
	/*
	 * For an input of exactly "zfs:" the only ':' has just been
	 * removed, so there is no root to derive.
	 */
	beroot = strchr(currdev, ':');
	if (beroot != NULL) {
		beroot++;
		setenv("zfs_be_root", beroot, 1);
		zfs_bootenv_initial(beroot);
	}
	free(currdev);
}
1138 
1139 static void
1140 zfs_bootenv_initial(const char *name)
1141 {
1142 	char		poolname[ZFS_MAXNAMELEN], *dsname;
1143 	char envname[32], envval[256];
1144 	uint64_t	objid;
1145 	spa_t		*spa;
1146 	int		bootenvs_idx, len, rv;
1147 
1148 	SLIST_INIT(&zfs_be_head);
1149 	zfs_env_count = 0;
1150 	len = strlen(name);
1151 	dsname = strchr(name, '/');
1152 	if (dsname != NULL) {
1153 		len = dsname - name;
1154 		dsname++;
1155 	} else
1156 		dsname = "";
1157 	strlcpy(poolname, name, len + 1);
1158 	spa = spa_find_by_name(poolname);
1159 	if (spa == NULL)
1160 		return;
1161 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1162 	if (rv != 0)
1163 		return;
1164 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1165 	bootenvs_idx = 0;
1166 	/* Populate the initial environment variables */
1167 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1168 		/* Enumerate all bootenvs for general usage */
1169 		snprintf(envname, sizeof(envname), "bootenvs[%d]", bootenvs_idx);
1170 		snprintf(envval, sizeof(envval), "zfs:%s/%s", name, zfs_be->name);
1171 		rv = setenv(envname, envval, 1);
1172 		if (rv != 0)
1173 			break;
1174 		bootenvs_idx++;
1175 	}
1176 	snprintf(envval, sizeof(envval), "%d", bootenvs_idx);
1177 	setenv("bootenvs_count", envval, 1);
1178 
1179 	/* Clean up the SLIST of ZFS BEs */
1180 	while (!SLIST_EMPTY(&zfs_be_head)) {
1181 		zfs_be = SLIST_FIRST(&zfs_be_head);
1182 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1183 		free(zfs_be->name);
1184 		free(zfs_be);
1185 	}
1186 
1187 	return;
1188 
1189 }
1190 
1191 int
1192 zfs_bootenv(const char *name)
1193 {
1194 	static char	poolname[ZFS_MAXNAMELEN], *dsname, *root;
1195 	char		becount[4];
1196 	uint64_t	objid;
1197 	spa_t		*spa;
1198 	int		len, rv, pages, perpage, currpage;
1199 
1200 	if (name == NULL)
1201 		return (EINVAL);
1202 	if ((root = getenv("zfs_be_root")) == NULL)
1203 		return (EINVAL);
1204 
1205 	if (strcmp(name, root) != 0) {
1206 		if (setenv("zfs_be_root", name, 1) != 0)
1207 			return (ENOMEM);
1208 	}
1209 
1210 	SLIST_INIT(&zfs_be_head);
1211 	zfs_env_count = 0;
1212 	len = strlen(name);
1213 	dsname = strchr(name, '/');
1214 	if (dsname != NULL) {
1215 		len = dsname - name;
1216 		dsname++;
1217 	} else
1218 		dsname = "";
1219 	memcpy(poolname, name, len);
1220 	poolname[len] = '\0';
1221 
1222 	spa = spa_find_by_name(poolname);
1223 	if (!spa)
1224 		return (ENXIO);
1225 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1226 	if (rv != 0)
1227 		return (rv);
1228 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1229 
1230 	/* Calculate and store the number of pages of BEs */
1231 	perpage = (ZFS_BE_LAST - ZFS_BE_FIRST + 1);
1232 	pages = (zfs_env_count / perpage) + ((zfs_env_count % perpage) > 0 ? 1 : 0);
1233 	snprintf(becount, 4, "%d", pages);
1234 	if (setenv("zfs_be_pages", becount, 1) != 0)
1235 		return (ENOMEM);
1236 
1237 	/* Roll over the page counter if it has exceeded the maximum */
1238 	currpage = strtol(getenv("zfs_be_currpage"), NULL, 10);
1239 	if (currpage > pages) {
1240 		if (setenv("zfs_be_currpage", "1", 1) != 0)
1241 			return (ENOMEM);
1242 	}
1243 
1244 	/* Populate the menu environment variables */
1245 	zfs_set_env();
1246 
1247 	/* Clean up the SLIST of ZFS BEs */
1248 	while (!SLIST_EMPTY(&zfs_be_head)) {
1249 		zfs_be = SLIST_FIRST(&zfs_be_head);
1250 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1251 		free(zfs_be->name);
1252 		free(zfs_be);
1253 	}
1254 
1255 	return (rv);
1256 }
1257 
1258 int
1259 zfs_belist_add(const char *name, uint64_t value __unused)
1260 {
1261 
1262 	/* Skip special datasets that start with a $ character */
1263 	if (strncmp(name, "$", 1) == 0) {
1264 		return (0);
1265 	}
1266 	/* Add the boot environment to the head of the SLIST */
1267 	zfs_be = malloc(sizeof(struct zfs_be_entry));
1268 	if (zfs_be == NULL) {
1269 		return (ENOMEM);
1270 	}
1271 	zfs_be->name = strdup(name);
1272 	if (zfs_be->name == NULL) {
1273 		free(zfs_be);
1274 		return (ENOMEM);
1275 	}
1276 	SLIST_INSERT_HEAD(&zfs_be_head, zfs_be, entries);
1277 	zfs_env_count++;
1278 
1279 	return (0);
1280 }
1281 
1282 int
1283 zfs_set_env(void)
1284 {
1285 	char envname[32], envval[256];
1286 	char *beroot, *pagenum;
1287 	int rv, page, ctr;
1288 
1289 	beroot = getenv("zfs_be_root");
1290 	if (beroot == NULL) {
1291 		return (1);
1292 	}
1293 
1294 	pagenum = getenv("zfs_be_currpage");
1295 	if (pagenum != NULL) {
1296 		page = strtol(pagenum, NULL, 10);
1297 	} else {
1298 		page = 1;
1299 	}
1300 
1301 	ctr = 1;
1302 	rv = 0;
1303 	zfs_env_index = ZFS_BE_FIRST;
1304 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1305 		/* Skip to the requested page number */
1306 		if (ctr <= ((ZFS_BE_LAST - ZFS_BE_FIRST + 1) * (page - 1))) {
1307 			ctr++;
1308 			continue;
1309 		}
1310 
1311 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
1312 		snprintf(envval, sizeof(envval), "%s", zfs_be->name);
1313 		rv = setenv(envname, envval, 1);
1314 		if (rv != 0) {
1315 			break;
1316 		}
1317 
1318 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
1319 		rv = setenv(envname, envval, 1);
1320 		if (rv != 0){
1321 			break;
1322 		}
1323 
1324 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
1325 		rv = setenv(envname, "set_bootenv", 1);
1326 		if (rv != 0){
1327 			break;
1328 		}
1329 
1330 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
1331 		snprintf(envval, sizeof(envval), "zfs:%s/%s", beroot, zfs_be->name);
1332 		rv = setenv(envname, envval, 1);
1333 		if (rv != 0){
1334 			break;
1335 		}
1336 
1337 		zfs_env_index++;
1338 		if (zfs_env_index > ZFS_BE_LAST) {
1339 			break;
1340 		}
1341 
1342 	}
1343 
1344 	for (; zfs_env_index <= ZFS_BE_LAST; zfs_env_index++) {
1345 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
1346 		(void)unsetenv(envname);
1347 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
1348 		(void)unsetenv(envname);
1349 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
1350 		(void)unsetenv(envname);
1351 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
1352 		(void)unsetenv(envname);
1353 	}
1354 
1355 	return (rv);
1356 }
1357