1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, 2014, Delphix. All rights reserved.
24  * Copyright (c) 2021, George Amanakis. All rights reserved.
25  */
26 
27 /*
28  * Routines to manage the on-disk persistent error log.
29  *
30  * Each pool stores a log of all logical data errors seen during normal
31  * operation.  This is actually the union of two distinct logs: the last log,
32  * and the current log.  All errors seen are logged to the current log.  When a
33  * scrub completes, the current log becomes the last log, the last log is thrown
34  * out, and the current log is reinitialized.  This way, if an error is somehow
35  * corrected, a new scrub will show that it no longer exists, and will be
36  * deleted from the log when the scrub completes.
37  *
38  * The log is stored using a ZAP object whose key is a string form of the
39  * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
40  * optional 'objset:object' human-readable string describing the data.  When an
41  * error is first logged, this string will be empty, indicating that no name is
42  * known.  This prevents us from having to issue a potentially large amount of
43  * I/O to discover the object name during an error path.  Instead, we do the
44  * calculation when the data is requested, storing the result so future queries
45  * will be faster.
46  *
47  * If the head_errlog feature is enabled, a different on-disk format is used.
48  * The error log of each head dataset is stored separately in the zap object
49  * and keyed by the head id. This enables listing every dataset affected in
50  * userland. In order to be able to track whether an error block has been
51  * modified or added to snapshots since it was marked as an error, a new tuple
52  * is introduced: zbookmark_err_phys_t. It allows the storage of the birth
53  * transaction group of an error block on-disk. The birth transaction group is
54  * used by check_filesystem() to assess whether this block was freed,
55  * re-written or added to a snapshot since its marking as an error.
56  *
57  * This log is then shipped into an nvlist where the key is the dataset name and
58  * the value is the object name.  Userland is then responsible for uniquifying
59  * this list and displaying it to the user.
60  */
61 
62 #include <sys/dmu_tx.h>
63 #include <sys/spa.h>
64 #include <sys/spa_impl.h>
65 #include <sys/zap.h>
66 #include <sys/zio.h>
67 #include <sys/dsl_dir.h>
68 #include <sys/dmu_objset.h>
69 #include <sys/dbuf.h>
70 
71 /*
72  * spa_upgrade_errlog_limit : A zfs module parameter that controls the number
73  * 		of on-disk error log entries that will be converted to the new
74  * 		format when enabling head_errlog. Defaults to 0 which converts
75  * 		all log entries.
76  */
77 static uint32_t spa_upgrade_errlog_limit = 0;
78 
79 /*
80  * Convert a bookmark to a string.
81  */
82 static void
83 bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
84 {
85 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
86 	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
87 	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
88 }
89 
90 /*
91  * Convert an err_phys to a string.
92  */
93 static void
94 errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
95 {
96 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
97 	    (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level,
98 	    (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth);
99 }
100 
101 /*
102  * Convert a string to a err_phys.
103  */
104 static void
105 name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
106 {
107 	zep->zb_object = zfs_strtonum(buf, &buf);
108 	ASSERT(*buf == ':');
109 	zep->zb_level = (int)zfs_strtonum(buf + 1, &buf);
110 	ASSERT(*buf == ':');
111 	zep->zb_blkid = zfs_strtonum(buf + 1, &buf);
112 	ASSERT(*buf == ':');
113 	zep->zb_birth = zfs_strtonum(buf + 1, &buf);
114 	ASSERT(*buf == '\0');
115 }
116 
117 /*
118  * Convert a string to a bookmark.
119  */
120 static void
121 name_to_bookmark(char *buf, zbookmark_phys_t *zb)
122 {
123 	zb->zb_objset = zfs_strtonum(buf, &buf);
124 	ASSERT(*buf == ':');
125 	zb->zb_object = zfs_strtonum(buf + 1, &buf);
126 	ASSERT(*buf == ':');
127 	zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
128 	ASSERT(*buf == ':');
129 	zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
130 	ASSERT(*buf == '\0');
131 }
132 
133 #ifdef _KERNEL
134 static void
135 zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
136 {
137 	zb->zb_objset = dataset;
138 	zb->zb_object = zep->zb_object;
139 	zb->zb_level = zep->zb_level;
140 	zb->zb_blkid = zep->zb_blkid;
141 }
142 #endif
143 
144 static void
145 name_to_object(char *buf, uint64_t *obj)
146 {
147 	*obj = zfs_strtonum(buf, &buf);
148 	ASSERT(*buf == '\0');
149 }
150 
151 static int
152 get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
153     uint64_t *head_dataset_id)
154 {
155 	dsl_pool_t *dp = spa->spa_dsl_pool;
156 	dsl_dataset_t *ds;
157 	objset_t *os;
158 
159 	dsl_pool_config_enter(dp, FTAG);
160 	int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
161 	if (error != 0) {
162 		dsl_pool_config_exit(dp, FTAG);
163 		return (error);
164 	}
165 	ASSERT(head_dataset_id);
166 	*head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
167 
168 	error = dmu_objset_from_ds(ds, &os);
169 	if (error != 0) {
170 		dsl_dataset_rele(ds, FTAG);
171 		dsl_pool_config_exit(dp, FTAG);
172 		return (error);
173 	}
174 
175 	dnode_t *dn;
176 	blkptr_t bp;
177 
178 	error = dnode_hold(os, zep->zb_object, FTAG, &dn);
179 	if (error != 0) {
180 		dsl_dataset_rele(ds, FTAG);
181 		dsl_pool_config_exit(dp, FTAG);
182 		return (error);
183 	}
184 
185 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
186 	error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
187 	    NULL);
188 
189 	if (error == 0 && BP_IS_HOLE(&bp))
190 		error = SET_ERROR(ENOENT);
191 
192 	zep->zb_birth = bp.blk_birth;
193 	rw_exit(&dn->dn_struct_rwlock);
194 	dnode_rele(dn, FTAG);
195 	dsl_dataset_rele(ds, FTAG);
196 	dsl_pool_config_exit(dp, FTAG);
197 	return (error);
198 }
199 
200 /*
201  * Log an uncorrectable error to the persistent error log.  We add it to the
202  * spa's list of pending errors.  The changes are actually synced out to disk
203  * during spa_errlog_sync().
204  */
205 void
206 spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
207 {
208 	spa_error_entry_t search;
209 	spa_error_entry_t *new;
210 	avl_tree_t *tree;
211 	avl_index_t where;
212 
213 	/*
214 	 * If we are trying to import a pool, ignore any errors, as we won't be
215 	 * writing to the pool any time soon.
216 	 */
217 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
218 		return;
219 
220 	mutex_enter(&spa->spa_errlist_lock);
221 
222 	/*
223 	 * If we have had a request to rotate the log, log it to the next list
224 	 * instead of the current one.
225 	 */
226 	if (spa->spa_scrub_active || spa->spa_scrub_finished)
227 		tree = &spa->spa_errlist_scrub;
228 	else
229 		tree = &spa->spa_errlist_last;
230 
231 	search.se_bookmark = *zb;
232 	if (avl_find(tree, &search, &where) != NULL) {
233 		mutex_exit(&spa->spa_errlist_lock);
234 		return;
235 	}
236 
237 	new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
238 	new->se_bookmark = *zb;
239 	avl_insert(tree, new, where);
240 
241 	mutex_exit(&spa->spa_errlist_lock);
242 }
243 
244 #ifdef _KERNEL
245 static int
246 find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
247     uint64_t *birth_txg)
248 {
249 	objset_t *os;
250 	int error = dmu_objset_from_ds(ds, &os);
251 	if (error != 0)
252 		return (error);
253 
254 	dnode_t *dn;
255 	blkptr_t bp;
256 
257 	error = dnode_hold(os, zep->zb_object, FTAG, &dn);
258 	if (error != 0)
259 		return (error);
260 
261 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
262 	error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
263 	    NULL);
264 
265 	if (error == 0 && BP_IS_HOLE(&bp))
266 		error = SET_ERROR(ENOENT);
267 
268 	*birth_txg = bp.blk_birth;
269 	rw_exit(&dn->dn_struct_rwlock);
270 	dnode_rele(dn, FTAG);
271 	return (error);
272 }
273 
274 /*
275  * This function serves a double role. If only_count is true, it returns
276  * (in *count) how many times an error block belonging to this filesystem is
277  * referenced by snapshots or clones. If only_count is false, each time the
278  * error block is referenced by a snapshot or clone, it fills the userspace
279  * array at uaddr with the bookmarks of the error blocks. The array is filled
280  * from the back and *count is modified to be the number of unused entries at
281  * the beginning of the array.
282  */
283 static int
284 check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
285     uint64_t *count, void *uaddr, boolean_t only_count)
286 {
287 	dsl_dataset_t *ds;
288 	dsl_pool_t *dp = spa->spa_dsl_pool;
289 
290 	int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);
291 	if (error != 0)
292 		return (error);
293 
294 	uint64_t latest_txg;
295 	uint64_t txg_to_consider = spa->spa_syncing_txg;
296 	boolean_t check_snapshot = B_TRUE;
297 	error = find_birth_txg(ds, zep, &latest_txg);
298 	if (error == 0) {
299 		if (zep->zb_birth == latest_txg) {
300 			/* Block neither free nor rewritten. */
301 			if (!only_count) {
302 				zbookmark_phys_t zb;
303 				zep_to_zb(head_ds, zep, &zb);
304 				if (copyout(&zb, (char *)uaddr + (*count - 1)
305 				    * sizeof (zbookmark_phys_t),
306 				    sizeof (zbookmark_phys_t)) != 0) {
307 					dsl_dataset_rele(ds, FTAG);
308 					return (SET_ERROR(EFAULT));
309 				}
310 				(*count)--;
311 			} else {
312 				(*count)++;
313 			}
314 			check_snapshot = B_FALSE;
315 		} else {
316 			ASSERT3U(zep->zb_birth, <, latest_txg);
317 			txg_to_consider = latest_txg;
318 		}
319 	}
320 
321 	/* How many snapshots reference this block. */
322 	uint64_t snap_count;
323 	error = zap_count(spa->spa_meta_objset,
324 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
325 	if (error != 0) {
326 		dsl_dataset_rele(ds, FTAG);
327 		return (error);
328 	}
329 
330 	if (snap_count == 0) {
331 		/* File system has no snapshot. */
332 		dsl_dataset_rele(ds, FTAG);
333 		return (0);
334 	}
335 
336 	uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t),
337 	    KM_SLEEP);
338 
339 	int aff_snap_count = 0;
340 	uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
341 	uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
342 
343 	/* Check only snapshots created from this file system. */
344 	while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
345 	    snap_obj_txg <= txg_to_consider) {
346 
347 		dsl_dataset_rele(ds, FTAG);
348 		error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
349 		if (error != 0)
350 			goto out;
351 
352 		if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds)
353 			break;
354 
355 		boolean_t affected = B_TRUE;
356 		if (check_snapshot) {
357 			uint64_t blk_txg;
358 			error = find_birth_txg(ds, zep, &blk_txg);
359 			affected = (error == 0 && zep->zb_birth == blk_txg);
360 		}
361 
362 		if (affected) {
363 			snap_obj_array[aff_snap_count] = snap_obj;
364 			aff_snap_count++;
365 
366 			if (!only_count) {
367 				zbookmark_phys_t zb;
368 				zep_to_zb(snap_obj, zep, &zb);
369 				if (copyout(&zb, (char *)uaddr + (*count - 1) *
370 				    sizeof (zbookmark_phys_t),
371 				    sizeof (zbookmark_phys_t)) != 0) {
372 					dsl_dataset_rele(ds, FTAG);
373 					error = SET_ERROR(EFAULT);
374 					goto out;
375 				}
376 				(*count)--;
377 			} else {
378 				(*count)++;
379 			}
380 
381 			/*
382 			 * Only clones whose origins were affected could also
383 			 * have affected snapshots.
384 			 */
385 			zap_cursor_t zc;
386 			zap_attribute_t za;
387 			for (zap_cursor_init(&zc, spa->spa_meta_objset,
388 			    dsl_dataset_phys(ds)->ds_next_clones_obj);
389 			    zap_cursor_retrieve(&zc, &za) == 0;
390 			    zap_cursor_advance(&zc)) {
391 				error = check_filesystem(spa,
392 				    za.za_first_integer, zep,
393 				    count, uaddr, only_count);
394 
395 				if (error != 0) {
396 					zap_cursor_fini(&zc);
397 					goto out;
398 				}
399 			}
400 			zap_cursor_fini(&zc);
401 		}
402 		snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
403 		snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
404 	}
405 	dsl_dataset_rele(ds, FTAG);
406 
407 out:
408 	kmem_free(snap_obj_array, sizeof (*snap_obj_array));
409 	return (error);
410 }
411 
412 static int
413 find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
414     uint64_t *top_affected_fs)
415 {
416 	uint64_t oldest_dsobj;
417 	int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
418 	    &oldest_dsobj);
419 	if (error != 0)
420 		return (error);
421 
422 	dsl_dataset_t *ds;
423 	error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj,
424 	    FTAG, &ds);
425 	if (error != 0)
426 		return (error);
427 
428 	*top_affected_fs =
429 	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
430 	dsl_dataset_rele(ds, FTAG);
431 	return (0);
432 }
433 
434 static int
435 process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
436     uint64_t *count, void *uaddr, boolean_t only_count)
437 {
438 	dsl_pool_t *dp = spa->spa_dsl_pool;
439 	dsl_pool_config_enter(dp, FTAG);
440 	uint64_t top_affected_fs;
441 
442 	int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
443 	if (error == 0)
444 		error = check_filesystem(spa, top_affected_fs, zep, count,
445 		    uaddr, only_count);
446 
447 	dsl_pool_config_exit(dp, FTAG);
448 	return (error);
449 }
450 
/*
 * Return the number of error entries recorded in the given on-disk error log
 * object (head_errlog format).  The outer ZAP is keyed by head dataset id and
 * each value is a nested ZAP of err_phys bookmarks; every entry is run
 * through process_error_block() in counting mode, which also counts
 * references from snapshots and clones.
 */
static uint64_t
get_errlog_size(spa_t *spa, uint64_t spa_err_obj)
{
	if (spa_err_obj == 0)
		return (0);
	uint64_t total = 0;

	zap_cursor_t zc;
	zap_attribute_t za;
	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {

		zap_cursor_t head_ds_cursor;
		zap_attribute_t head_ds_attr;
		zbookmark_err_phys_t head_ds_block;

		/* The outer ZAP key is the head dataset object number. */
		uint64_t head_ds;
		name_to_object(za.za_name, &head_ds);

		for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
		    za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor,
		    &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {

			name_to_errphys(head_ds_attr.za_name, &head_ds_block);
			/* Counting mode: failures are deliberately ignored. */
			(void) process_error_block(spa, head_ds, &head_ds_block,
			    &total, NULL, B_TRUE);
		}
		zap_cursor_fini(&head_ds_cursor);
	}
	zap_cursor_fini(&zc);
	return (total);
}
483 
/*
 * Return the number of error entries represented by the in-core pending
 * error list 'tree' (head_errlog format).  Each bookmark is converted to an
 * err_phys (its birth txg is filled in by get_head_and_birth_txg()) and
 * counted via process_error_block(), including snapshot/clone references.
 */
static uint64_t
get_errlist_size(spa_t *spa, avl_tree_t *tree)
{
	if (avl_numnodes(tree) == 0)
		return (0);
	uint64_t total = 0;

	spa_error_entry_t *se;
	for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) {
		zbookmark_err_phys_t zep;
		zep.zb_object = se->se_bookmark.zb_object;
		zep.zb_level = se->se_bookmark.zb_level;
		zep.zb_blkid = se->se_bookmark.zb_blkid;

		/*
		 * If we cannot find out the head dataset and birth txg of
		 * the present error block, we opt not to error out. In the
		 * next pool sync this information will be retrieved by
		 * sync_error_list() and written to the on-disk error log.
		 */
		uint64_t head_ds_obj;
		if (get_head_and_birth_txg(spa, &zep,
		    se->se_bookmark.zb_objset, &head_ds_obj) == 0)
			(void) process_error_block(spa, head_ds_obj, &zep,
			    &total, NULL, B_TRUE);
	}
	return (total);
}
512 #endif
513 
514 /*
515  * Return the number of errors currently in the error log.  This is actually the
516  * sum of both the last log and the current log, since we don't know the union
517  * of these logs until we reach userland.
518  */
uint64_t
spa_get_errlog_size(spa_t *spa)
{
	uint64_t total = 0;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
		/* Legacy format: one ZAP entry per error bookmark. */
		mutex_enter(&spa->spa_errlog_lock);
		uint64_t count;
		if (spa->spa_errlog_scrub != 0 &&
		    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
		    &count) == 0)
			total += count;

		/* The last log is skipped once a scrub has completed. */
		if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
		    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
		    &count) == 0)
			total += count;
		mutex_exit(&spa->spa_errlog_lock);

		/* Add the in-core pending error lists. */
		mutex_enter(&spa->spa_errlist_lock);
		total += avl_numnodes(&spa->spa_errlist_last);
		total += avl_numnodes(&spa->spa_errlist_scrub);
		mutex_exit(&spa->spa_errlist_lock);
	} else {
#ifdef _KERNEL
		/*
		 * head_errlog format: walk the per-dataset logs and pending
		 * lists, counting snapshot and clone references as well.
		 */
		mutex_enter(&spa->spa_errlog_lock);
		total += get_errlog_size(spa, spa->spa_errlog_last);
		total += get_errlog_size(spa, spa->spa_errlog_scrub);
		mutex_exit(&spa->spa_errlog_lock);

		mutex_enter(&spa->spa_errlist_lock);
		total += get_errlist_size(spa, &spa->spa_errlist_last);
		total += get_errlist_size(spa, &spa->spa_errlist_scrub);
		mutex_exit(&spa->spa_errlist_lock);
#endif
	}
	return (total);
}
557 
558 /*
559  * This function sweeps through an on-disk error log and stores all bookmarks
560  * as error bookmarks in a new ZAP object. At the end we discard the old one,
561  * and spa_update_errlog() will set the spa's on-disk error log to new ZAP
562  * object.
563  */
static void
sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
    dmu_tx_t *tx)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	zbookmark_phys_t zb;
	uint64_t count;

	/* The new outer ZAP, keyed by head dataset object number. */
	*newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
	    DMU_OT_NONE, 0, tx);

	/*
	 * If we cannot perform the upgrade we should clear the old on-disk
	 * error logs.
	 */
	if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) {
		VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
		return;
	}

	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		/* Honor the module-parameter cap on converted entries. */
		if (spa_upgrade_errlog_limit != 0 &&
		    zc.zc_cd == spa_upgrade_errlog_limit)
			break;

		name_to_bookmark(za.za_name, &zb);

		zbookmark_err_phys_t zep;
		zep.zb_object = zb.zb_object;
		zep.zb_level = zb.zb_level;
		zep.zb_blkid = zb.zb_blkid;

		/*
		 * We cannot use get_head_and_birth_txg() because it will
		 * acquire the pool config lock, which we already have. In case
		 * of an error we simply continue.
		 */
		uint64_t head_dataset_obj;
		dsl_pool_t *dp = spa->spa_dsl_pool;
		dsl_dataset_t *ds;
		objset_t *os;

		int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds);
		if (error != 0)
			continue;

		head_dataset_obj =
		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;

		/*
		 * The objset and the dnode are required for getting the block
		 * pointer, which is used to determine if BP_IS_HOLE(). If
		 * getting the objset or the dnode fails, do not create a
		 * zap entry (presuming we know the dataset) as this may create
		 * spurious errors that we cannot ever resolve. If an error is
		 * truly persistent, it should re-appear after a scan.
		 */
		if (dmu_objset_from_ds(ds, &os) != 0) {
			dsl_dataset_rele(ds, FTAG);
			continue;
		}

		dnode_t *dn;
		blkptr_t bp;

		if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) {
			dsl_dataset_rele(ds, FTAG);
			continue;
		}

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
		    NULL, NULL);

		zep.zb_birth = bp.blk_birth;
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);
		dsl_dataset_rele(ds, FTAG);

		/* Skip blocks that have been freed since they were logged. */
		if (error != 0 || BP_IS_HOLE(&bp))
			continue;

		/* Find (or create) the per-head-dataset nested ZAP. */
		uint64_t err_obj;
		error = zap_lookup_int_key(spa->spa_meta_objset, *newobj,
		    head_dataset_obj, &err_obj);

		if (error == ENOENT) {
			err_obj = zap_create(spa->spa_meta_objset,
			    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);

			(void) zap_update_int_key(spa->spa_meta_objset,
			    *newobj, head_dataset_obj, err_obj, tx);
		}

		/* The value is an empty name string, as in sync_error_list(). */
		char buf[64];
		char *name = "";
		errphys_to_name(&zep, buf, sizeof (buf));

		(void) zap_update(spa->spa_meta_objset, err_obj,
		    buf, 1, strlen(name) + 1, name, tx);
	}
	zap_cursor_fini(&zc);

	/* The old-format log is no longer needed. */
	VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
}
672 
673 void
674 spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
675 {
676 	uint64_t newobj = 0;
677 
678 	mutex_enter(&spa->spa_errlog_lock);
679 	if (spa->spa_errlog_last != 0) {
680 		sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx);
681 		spa->spa_errlog_last = newobj;
682 	}
683 
684 	if (spa->spa_errlog_scrub != 0) {
685 		sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx);
686 		spa->spa_errlog_scrub = newobj;
687 	}
688 	mutex_exit(&spa->spa_errlog_lock);
689 }
690 
691 #ifdef _KERNEL
692 /*
693  * If an error block is shared by two datasets it will be counted twice. For
694  * detailed message see spa_get_errlog_size() above.
695  */
/*
 * Copy the bookmarks of an on-disk error log object to the userspace array
 * at 'uaddr'.  The array is filled from the back; on entry *count is the
 * number of free slots and on return it is the number of unused slots at the
 * front.  Returns ENOMEM if the array is too small (legacy format) and
 * EFAULT on copyout failure.
 *
 * If an error block is shared by two datasets it will be counted twice. For
 * detailed message see spa_get_errlog_size() above.
 */
static int
process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
{
	zap_cursor_t zc;
	zap_attribute_t za;

	if (obj == 0)
		return (0);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
		/* Legacy format: ZAP keys are full bookmarks. */
		for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			if (*count == 0) {
				zap_cursor_fini(&zc);
				return (SET_ERROR(ENOMEM));
			}

			zbookmark_phys_t zb;
			name_to_bookmark(za.za_name, &zb);

			if (copyout(&zb, (char *)uaddr +
			    (*count - 1) * sizeof (zbookmark_phys_t),
			    sizeof (zbookmark_phys_t)) != 0) {
				zap_cursor_fini(&zc);
				return (SET_ERROR(EFAULT));
			}
			*count -= 1;

		}
		zap_cursor_fini(&zc);
		return (0);
	}

	/*
	 * head_errlog format: the outer ZAP is keyed by head dataset id,
	 * each value being a nested ZAP of err_phys entries.
	 */
	for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {

		zap_cursor_t head_ds_cursor;
		zap_attribute_t head_ds_attr;

		uint64_t head_ds_err_obj = za.za_first_integer;
		uint64_t head_ds;
		name_to_object(za.za_name, &head_ds);
		for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
		    head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor,
		    &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {

			zbookmark_err_phys_t head_ds_block;
			name_to_errphys(head_ds_attr.za_name, &head_ds_block);
			int error = process_error_block(spa, head_ds,
			    &head_ds_block, count, uaddr, B_FALSE);

			if (error != 0) {
				zap_cursor_fini(&head_ds_cursor);
				zap_cursor_fini(&zc);
				return (error);
			}
		}
		zap_cursor_fini(&head_ds_cursor);
	}
	zap_cursor_fini(&zc);
	return (0);
}
760 
/*
 * Copy the bookmarks of an in-core pending error list to the userspace
 * array at 'uaddr', filling it from the back as in process_error_log().
 * In the head_errlog case each bookmark's birth txg and head dataset are
 * resolved first, and snapshot/clone references are expanded via
 * process_error_block().
 */
static int
process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
{
	spa_error_entry_t *se;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
		for (se = avl_first(list); se != NULL;
		    se = AVL_NEXT(list, se)) {

			if (*count == 0)
				return (SET_ERROR(ENOMEM));

			if (copyout(&se->se_bookmark, (char *)uaddr +
			    (*count - 1) * sizeof (zbookmark_phys_t),
			    sizeof (zbookmark_phys_t)) != 0)
				return (SET_ERROR(EFAULT));

			*count -= 1;
		}
		return (0);
	}

	for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
		zbookmark_err_phys_t zep;
		zep.zb_object = se->se_bookmark.zb_object;
		zep.zb_level = se->se_bookmark.zb_level;
		zep.zb_blkid = se->se_bookmark.zb_blkid;

		/* Fills in zep.zb_birth as a side effect. */
		uint64_t head_ds_obj;
		int error = get_head_and_birth_txg(spa, &zep,
		    se->se_bookmark.zb_objset, &head_ds_obj);
		if (error != 0)
			return (error);

		error = process_error_block(spa, head_ds_obj, &zep, count,
		    uaddr, B_FALSE);
		if (error != 0)
			return (error);
	}
	return (0);
}
802 #endif
803 
804 /*
805  * Copy all known errors to userland as an array of bookmarks.  This is
806  * actually a union of the on-disk last log and current log, as well as any
807  * pending error requests.
808  *
809  * Because the act of reading the on-disk log could cause errors to be
810  * generated, we have two separate locks: one for the error log and one for the
 * in-core error lists.  We only need the error list lock to log an error, so
812  * we grab the error log lock while we read the on-disk logs, and only pick up
813  * the error list lock when we are finished.
814  */
int
spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
{
	int ret = 0;

#ifdef _KERNEL
	/* Take the errlog lock first; see the lock-ordering comment above. */
	mutex_enter(&spa->spa_errlog_lock);

	ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);

	/* The last log is stale once a scrub has completed. */
	if (!ret && !spa->spa_scrub_finished)
		ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
		    count);

	mutex_enter(&spa->spa_errlist_lock);
	if (!ret)
		ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr,
		    count);
	if (!ret)
		ret = process_error_list(spa, &spa->spa_errlist_last, uaddr,
		    count);
	mutex_exit(&spa->spa_errlist_lock);

	mutex_exit(&spa->spa_errlog_lock);
#else
	(void) spa, (void) uaddr, (void) count;
#endif

	return (ret);
}
845 
846 /*
 * Called when a scrub completes.  This simply sets a bit which tells which AVL
848  * tree to add new errors.  spa_errlog_sync() is responsible for actually
849  * syncing the changes to the underlying objects.
850  */
void
spa_errlog_rotate(spa_t *spa)
{
	/* Only the errlist lock is needed to flip the rotation flag. */
	mutex_enter(&spa->spa_errlist_lock);
	spa->spa_scrub_finished = B_TRUE;
	mutex_exit(&spa->spa_errlist_lock);
}
858 
859 /*
860  * Discard any pending errors from the spa_t.  Called when unloading a faulted
861  * pool, as the errors encountered during the open cannot be synced to disk.
862  */
863 void
864 spa_errlog_drain(spa_t *spa)
865 {
866 	spa_error_entry_t *se;
867 	void *cookie;
868 
869 	mutex_enter(&spa->spa_errlist_lock);
870 
871 	cookie = NULL;
872 	while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
873 	    &cookie)) != NULL)
874 		kmem_free(se, sizeof (spa_error_entry_t));
875 	cookie = NULL;
876 	while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
877 	    &cookie)) != NULL)
878 		kmem_free(se, sizeof (spa_error_entry_t));
879 
880 	mutex_exit(&spa->spa_errlist_lock);
881 }
882 
883 /*
884  * Process a list of errors into the current on-disk log.
885  */
/*
 * Process a list of errors into the current on-disk log, creating the log
 * object if needed, and purge the in-core list afterwards.  Called from
 * spa_errlog_sync() with spa_errlog_lock held.
 */
void
sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
{
	spa_error_entry_t *se;
	char buf[64];
	void *cookie;

	if (avl_numnodes(t) == 0)
		return;

	/* create log if necessary */
	if (*obj == 0)
		*obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
		    DMU_OT_NONE, 0, tx);

	/* add errors to the current log */
	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
		/* Legacy format: bookmark string -> (optional) object name. */
		for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
			char *name = se->se_name ? se->se_name : "";

			bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));

			(void) zap_update(spa->spa_meta_objset, *obj, buf, 1,
			    strlen(name) + 1, name, tx);
		}
	} else {
		/* head_errlog: nested ZAP per head dataset. */
		for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
			char *name = se->se_name ? se->se_name : "";

			zbookmark_err_phys_t zep;
			zep.zb_object = se->se_bookmark.zb_object;
			zep.zb_level = se->se_bookmark.zb_level;
			zep.zb_blkid = se->se_bookmark.zb_blkid;

			/*
			 * If we cannot find out the head dataset and birth txg
			 * of the present error block, we simply continue.
			 * Reinserting that error block to the error lists,
			 * even if we are not syncing the final txg, results
			 * in duplicate posting of errors.
			 */
			uint64_t head_dataset_obj;
			int error = get_head_and_birth_txg(spa, &zep,
			    se->se_bookmark.zb_objset, &head_dataset_obj);
			if (error != 0)
				continue;

			/* Find (or create) the nested per-dataset ZAP. */
			uint64_t err_obj;
			error = zap_lookup_int_key(spa->spa_meta_objset,
			    *obj, head_dataset_obj, &err_obj);

			if (error == ENOENT) {
				err_obj = zap_create(spa->spa_meta_objset,
				    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);

				(void) zap_update_int_key(spa->spa_meta_objset,
				    *obj, head_dataset_obj, err_obj, tx);
			}
			errphys_to_name(&zep, buf, sizeof (buf));

			(void) zap_update(spa->spa_meta_objset,
			    err_obj, buf, 1, strlen(name) + 1, name, tx);
		}
	}
	/* purge the error list */
	cookie = NULL;
	while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(se, sizeof (spa_error_entry_t));
}
955 
956 static void
957 delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx)
958 {
959 	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
960 		zap_cursor_t zc;
961 		zap_attribute_t za;
962 		for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
963 		    zap_cursor_retrieve(&zc, &za) == 0;
964 		    zap_cursor_advance(&zc)) {
965 			VERIFY0(dmu_object_free(spa->spa_meta_objset,
966 			    za.za_first_integer, tx));
967 		}
968 		zap_cursor_fini(&zc);
969 	}
970 	VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
971 }
972 
973 /*
974  * Sync the error log out to disk.  This is a little tricky because the act of
975  * writing the error log requires the spa_errlist_lock.  So, we need to lock the
976  * error lists, take a copy of the lists, and then reinitialize them.  Then, we
977  * drop the error list lock and take the error log lock, at which point we
978  * do the errlog processing.  Then, if we encounter an I/O error during this
979  * process, we can successfully add the error to the list.  Note that this will
980  * result in the perpetual recycling of errors, but it is an unlikely situation
981  * and not a performance critical operation.
982  */
void
spa_errlog_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	avl_tree_t scrub, last;
	int scrub_finished;

	mutex_enter(&spa->spa_errlist_lock);

	/*
	 * Bail out early under normal circumstances.
	 */
	if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
	    avl_numnodes(&spa->spa_errlist_last) == 0 &&
	    !spa->spa_scrub_finished) {
		mutex_exit(&spa->spa_errlist_lock);
		return;
	}

	/* Take local copies of the lists and reinitialize the spa's. */
	spa_get_errlists(spa, &last, &scrub);
	scrub_finished = spa->spa_scrub_finished;
	spa->spa_scrub_finished = B_FALSE;

	/* Drop the list lock before taking the log lock (see above). */
	mutex_exit(&spa->spa_errlist_lock);
	mutex_enter(&spa->spa_errlog_lock);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	/*
	 * Sync out the current list of errors.
	 */
	sync_error_list(spa, &last, &spa->spa_errlog_last, tx);

	/*
	 * Rotate the log if necessary.
	 */
	if (scrub_finished) {
		if (spa->spa_errlog_last != 0)
			delete_errlog(spa, spa->spa_errlog_last, tx);
		spa->spa_errlog_last = spa->spa_errlog_scrub;
		spa->spa_errlog_scrub = 0;

		sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
	}

	/*
	 * Sync out any pending scrub errors.  (If we just rotated, the scrub
	 * list was already purged by sync_error_list() and this is a no-op.)
	 */
	sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);

	/*
	 * Update the MOS to reflect the new values.
	 */
	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
	    &spa->spa_errlog_last, tx);
	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
	    &spa->spa_errlog_scrub, tx);

	dmu_tx_commit(tx);

	mutex_exit(&spa->spa_errlog_lock);
}
1047 
1048 static void
1049 delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds,
1050     dmu_tx_t *tx)
1051 {
1052 	if (spa_err_obj == 0)
1053 		return;
1054 
1055 	zap_cursor_t zc;
1056 	zap_attribute_t za;
1057 	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
1058 	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
1059 		uint64_t head_ds;
1060 		name_to_object(za.za_name, &head_ds);
1061 		if (head_ds == ds) {
1062 			(void) zap_remove(spa->spa_meta_objset, spa_err_obj,
1063 			    za.za_name, tx);
1064 			VERIFY0(dmu_object_free(spa->spa_meta_objset,
1065 			    za.za_first_integer, tx));
1066 			break;
1067 		}
1068 	}
1069 	zap_cursor_fini(&zc);
1070 }
1071 
/*
 * Delete the error log entries belonging to head dataset 'ds' from both
 * the scrub and last on-disk error logs, under spa_errlog_lock.
 */
void
spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx)
{
	mutex_enter(&spa->spa_errlog_lock);
	delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx);
	delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx);
	mutex_exit(&spa->spa_errlog_lock);
}
1080 
/*
 * Walk the snapshot chain of 'old_head' backwards until reaching a
 * snapshot whose dsl_dir now points at 'new_head', and return that
 * snapshot's birth txg in '*txg'.  Returns 0 on success or the error
 * from dsl_dataset_hold_obj().
 */
static int
find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head,
    uint64_t *txg)
{
	dsl_dataset_t *ds;
	dsl_pool_t *dp = spa->spa_dsl_pool;

	int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds);
	if (error != 0)
		return (error);

	uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;

	while (prev_obj != 0) {
		/* Release the current hold before taking the next one. */
		dsl_dataset_rele(ds, FTAG);
		if ((error = dsl_dataset_hold_obj(dp, prev_obj,
		    FTAG, &ds)) == 0 &&
		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head)
			break;

		/* Hold failed, so there is nothing held to release. */
		if (error != 0)
			return (error);

		/*
		 * Not the one; remember this snapshot's predecessor so
		 * that on a match '*txg' is the found snapshot's txg.
		 */
		prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
		prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	}
	dsl_dataset_rele(ds, FTAG);
	/*
	 * NOTE(review): callers appear to guarantee the chain contains a
	 * snapshot belonging to 'new_head' (swap_errlog() runs during a
	 * promotion); if it ever did not, we would fall out of the loop
	 * with prev_obj == 0 and a stale prev_obj_txg -- confirm.
	 */
	ASSERT(prev_obj != 0);
	*txg = prev_obj_txg;
	return (0);
}
1113 
1114 static void
1115 swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t
1116     old_head, dmu_tx_t *tx)
1117 {
1118 	if (spa_err_obj == 0)
1119 		return;
1120 
1121 	uint64_t old_head_errlog;
1122 	int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj,
1123 	    old_head, &old_head_errlog);
1124 
1125 	/* If no error log, then there is nothing to do. */
1126 	if (error != 0)
1127 		return;
1128 
1129 	uint64_t txg;
1130 	error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg);
1131 	if (error != 0)
1132 		return;
1133 
1134 	/*
1135 	 * Create an error log if the file system being promoted does not
1136 	 * already have one.
1137 	 */
1138 	uint64_t new_head_errlog;
1139 	error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head,
1140 	    &new_head_errlog);
1141 
1142 	if (error != 0) {
1143 		new_head_errlog = zap_create(spa->spa_meta_objset,
1144 		    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
1145 
1146 		(void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj,
1147 		    new_head, new_head_errlog, tx);
1148 	}
1149 
1150 	zap_cursor_t zc;
1151 	zap_attribute_t za;
1152 	zbookmark_err_phys_t err_block;
1153 	for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog);
1154 	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
1155 
1156 		char *name = "";
1157 		name_to_errphys(za.za_name, &err_block);
1158 		if (err_block.zb_birth < txg) {
1159 			(void) zap_update(spa->spa_meta_objset, new_head_errlog,
1160 			    za.za_name, 1, strlen(name) + 1, name, tx);
1161 
1162 			(void) zap_remove(spa->spa_meta_objset, old_head_errlog,
1163 			    za.za_name, tx);
1164 		}
1165 	}
1166 	zap_cursor_fini(&zc);
1167 }
1168 
/*
 * After a clone promotion, transfer the applicable error log entries
 * from the old head dataset to the new one in both the scrub and last
 * logs, under spa_errlog_lock.
 */
void
spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
    dmu_tx_t *tx)
{
	mutex_enter(&spa->spa_errlog_lock);
	swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx);
	swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx);
	mutex_exit(&spa->spa_errlog_lock);
}
1178 
1179 #if defined(_KERNEL)
1180 /* error handling */
1181 EXPORT_SYMBOL(spa_log_error);
1182 EXPORT_SYMBOL(spa_get_errlog_size);
1183 EXPORT_SYMBOL(spa_get_errlog);
1184 EXPORT_SYMBOL(spa_errlog_rotate);
1185 EXPORT_SYMBOL(spa_errlog_drain);
1186 EXPORT_SYMBOL(spa_errlog_sync);
1187 EXPORT_SYMBOL(spa_get_errlists);
1188 EXPORT_SYMBOL(spa_delete_dataset_errlog);
1189 EXPORT_SYMBOL(spa_swap_errlog);
1190 EXPORT_SYMBOL(sync_error_list);
1191 EXPORT_SYMBOL(spa_upgrade_errlog);
1192 #endif
1193 
1194 /* BEGIN CSTYLED */
1195 ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, INT, ZMOD_RW,
1196 	"Limit the number of errors which will be upgraded to the new "
1197 	"on-disk error log when enabling head_errlog");
1198 /* END CSTYLED */
1199