1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012 by Delphix. All rights reserved.
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/dmu_objset.h>
28 #include <sys/dmu_traverse.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dsl_pool.h>
32 #include <sys/dnode.h>
33 #include <sys/spa.h>
34 #include <sys/zio.h>
35 #include <sys/dmu_impl.h>
36 #include <sys/sa.h>
37 #include <sys/sa_impl.h>
38 #include <sys/callb.h>
39 
/*
 * Limit on how many blocks the prefetch callback may have issued that the
 * main traversal has not yet consumed.  Copied into pd_blks_max when a
 * traversal starts.
 */
int zfs_pd_blks_max = 100;
41 
/*
 * State shared between a traversal and the prefetch thread working ahead
 * of it.
 */
typedef struct prefetch_data {
	kmutex_t pd_mtx;	/* taken around pd_blks_fetched updates */
	kcondvar_t pd_cv;	/* broadcast after the fields below change */
	int pd_blks_max;	/* prefetcher waits once fetched reaches this */
	int pd_blks_fetched;	/* blocks prefetched but not yet consumed */
	int pd_flags;		/* copy of the traversal's TRAVERSE_* flags */
	boolean_t pd_cancel;	/* set by traverse_impl() to stop prefetching */
	boolean_t pd_exited;	/* prefetch thread has finished or never ran */
} prefetch_data_t;
51 
/*
 * Parameters of one traversal, threaded through every recursive call.
 */
typedef struct traverse_data {
	spa_t *td_spa;		/* pool handed to reads and callbacks */
	uint64_t td_objset;	/* objset number stored in bookmarks */
	blkptr_t *td_rootbp;	/* block pointer the traversal starts from */
	uint64_t td_min_txg;	/* skip blocks with blk_birth <= this */
	zbookmark_t *td_resume;	/* resume/pause bookmark; may be NULL */
	int td_flags;		/* TRAVERSE_* flags */
	prefetch_data_t *td_pfd; /* NULL in the prefetch thread's own copy */
	blkptr_cb_t *td_func;	/* callback invoked for visited blocks */
	void *td_arg;		/* opaque argument passed to td_func */
} traverse_data_t;
63 
/*
 * Forward declaration: traverse_visitbp() and traverse_dnode() call each
 * other.
 */
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object);
66 
67 static int
68 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
69 {
70 	traverse_data_t *td = arg;
71 	zbookmark_t zb;
72 
73 	if (bp->blk_birth == 0)
74 		return (0);
75 
76 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
77 		return (0);
78 
79 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
80 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
81 
82 	(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
83 
84 	return (0);
85 }
86 
87 static int
88 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
89 {
90 	traverse_data_t *td = arg;
91 
92 	if (lrc->lrc_txtype == TX_WRITE) {
93 		lr_write_t *lr = (lr_write_t *)lrc;
94 		blkptr_t *bp = &lr->lr_blkptr;
95 		zbookmark_t zb;
96 
97 		if (bp->blk_birth == 0)
98 			return (0);
99 
100 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
101 			return (0);
102 
103 		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
104 		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
105 
106 		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
107 		    td->td_arg);
108 	}
109 	return (0);
110 }
111 
112 static void
113 traverse_zil(traverse_data_t *td, zil_header_t *zh)
114 {
115 	uint64_t claim_txg = zh->zh_claim_txg;
116 	zilog_t *zilog;
117 
118 	/*
119 	 * We only want to visit blocks that have been claimed but not yet
120 	 * replayed; plus, in read-only mode, blocks that are already stable.
121 	 */
122 	if (claim_txg == 0 && spa_writeable(td->td_spa))
123 		return;
124 
125 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
126 
127 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
128 	    claim_txg);
129 
130 	zil_free(zilog);
131 }
132 
/* Verdict returned by resume_skip_check() for a block about to be visited. */
typedef enum resume_skip {
	RESUME_SKIP_ALL,	/* skip the block and everything below it */
	RESUME_SKIP_NONE,	/* visit the block normally */
	RESUME_SKIP_CHILDREN	/* visit the block but not its children */
} resume_skip_t;
138 
139 /*
140  * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
141  * the block indicated by zb does not need to be visited at all. Returns
142  * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
143  * resume point. This indicates that this block should be visited but not its
144  * children (since they must have been visited in a previous traversal).
145  * Otherwise returns RESUME_SKIP_NONE.
146  */
147 static resume_skip_t
148 resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
149     const zbookmark_t *zb)
150 {
151 	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
152 		/*
153 		 * If we already visited this bp & everything below,
154 		 * don't bother doing it again.
155 		 */
156 		if (zbookmark_is_before(dnp, zb, td->td_resume))
157 			return (RESUME_SKIP_ALL);
158 
159 		/*
160 		 * If we found the block we're trying to resume from, zero
161 		 * the bookmark out to indicate that we have resumed.
162 		 */
163 		ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
164 		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
165 			bzero(td->td_resume, sizeof (*zb));
166 			if (td->td_flags & TRAVERSE_POST)
167 				return (RESUME_SKIP_CHILDREN);
168 		}
169 	}
170 	return (RESUME_SKIP_NONE);
171 }
172 
173 static void
174 traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
175 {
176 	ASSERT(td->td_resume != NULL);
177 	ASSERT0(zb->zb_level);
178 	bcopy(zb, td->td_resume, sizeof (*td->td_resume));
179 }
180 
181 static int
182 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
183     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
184 {
185 	zbookmark_t czb;
186 	int err = 0, lasterr = 0;
187 	arc_buf_t *buf = NULL;
188 	prefetch_data_t *pd = td->td_pfd;
189 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
190 	boolean_t pause = B_FALSE;
191 
192 	switch (resume_skip_check(td, dnp, zb)) {
193 	case RESUME_SKIP_ALL:
194 		return (0);
195 	case RESUME_SKIP_CHILDREN:
196 		goto post;
197 	case RESUME_SKIP_NONE:
198 		break;
199 	default:
200 		ASSERT(0);
201 	}
202 
203 	if (BP_IS_HOLE(bp)) {
204 		err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
205 		    td->td_arg);
206 		return (err);
207 	}
208 
209 	if (bp->blk_birth <= td->td_min_txg)
210 		return (0);
211 
212 	if (pd && !pd->pd_exited &&
213 	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
214 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
215 		mutex_enter(&pd->pd_mtx);
216 		ASSERT(pd->pd_blks_fetched >= 0);
217 		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
218 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
219 		pd->pd_blks_fetched--;
220 		cv_broadcast(&pd->pd_cv);
221 		mutex_exit(&pd->pd_mtx);
222 	}
223 
224 	if (td->td_flags & TRAVERSE_PRE) {
225 		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
226 		    td->td_arg);
227 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
228 			return (0);
229 		if (err == ERESTART)
230 			pause = B_TRUE; /* handle pausing at a common point */
231 		if (err != 0)
232 			goto post;
233 	}
234 
235 	if (BP_GET_LEVEL(bp) > 0) {
236 		uint32_t flags = ARC_WAIT;
237 		int i;
238 		blkptr_t *cbp;
239 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
240 
241 		err = dsl_read(NULL, td->td_spa, bp, pbuf,
242 		    arc_getbuf_func, &buf,
243 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
244 		if (err)
245 			return (err);
246 
247 		/* recursively visitbp() blocks below this */
248 		cbp = buf->b_data;
249 		for (i = 0; i < epb; i++, cbp++) {
250 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
251 			    zb->zb_level - 1,
252 			    zb->zb_blkid * epb + i);
253 			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
254 			if (err) {
255 				if (!hard)
256 					break;
257 				lasterr = err;
258 			}
259 		}
260 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
261 		uint32_t flags = ARC_WAIT;
262 		int i;
263 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
264 
265 		err = dsl_read(NULL, td->td_spa, bp, pbuf,
266 		    arc_getbuf_func, &buf,
267 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
268 		if (err)
269 			return (err);
270 
271 		/* recursively visitbp() blocks below this */
272 		dnp = buf->b_data;
273 		for (i = 0; i < epb; i++, dnp++) {
274 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
275 			    zb->zb_blkid * epb + i);
276 			if (err) {
277 				if (!hard)
278 					break;
279 				lasterr = err;
280 			}
281 		}
282 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
283 		uint32_t flags = ARC_WAIT;
284 		objset_phys_t *osp;
285 		dnode_phys_t *dnp;
286 
287 		err = dsl_read_nolock(NULL, td->td_spa, bp,
288 		    arc_getbuf_func, &buf,
289 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
290 		if (err)
291 			return (err);
292 
293 		osp = buf->b_data;
294 		dnp = &osp->os_meta_dnode;
295 		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
296 		    DMU_META_DNODE_OBJECT);
297 		if (err && hard) {
298 			lasterr = err;
299 			err = 0;
300 		}
301 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
302 			dnp = &osp->os_userused_dnode;
303 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
304 			    DMU_USERUSED_OBJECT);
305 		}
306 		if (err && hard) {
307 			lasterr = err;
308 			err = 0;
309 		}
310 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
311 			dnp = &osp->os_groupused_dnode;
312 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
313 			    DMU_GROUPUSED_OBJECT);
314 		}
315 	}
316 
317 	if (buf)
318 		(void) arc_buf_remove_ref(buf, &buf);
319 
320 post:
321 	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
322 		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
323 		    td->td_arg);
324 		if (err == ERESTART)
325 			pause = B_TRUE;
326 	}
327 
328 	if (pause && td->td_resume != NULL) {
329 		ASSERT3U(err, ==, ERESTART);
330 		ASSERT(!hard);
331 		traverse_pause(td, zb);
332 	}
333 
334 	return (err != 0 ? err : lasterr);
335 }
336 
337 static int
338 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
339     arc_buf_t *buf, uint64_t objset, uint64_t object)
340 {
341 	int j, err = 0, lasterr = 0;
342 	zbookmark_t czb;
343 	boolean_t hard = (td->td_flags & TRAVERSE_HARD);
344 
345 	for (j = 0; j < dnp->dn_nblkptr; j++) {
346 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
347 		err = traverse_visitbp(td, dnp, buf,
348 		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
349 		if (err) {
350 			if (!hard)
351 				break;
352 			lasterr = err;
353 		}
354 	}
355 
356 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
357 		SET_BOOKMARK(&czb, objset,
358 		    object, 0, DMU_SPILL_BLKID);
359 		err = traverse_visitbp(td, dnp, buf,
360 		    (blkptr_t *)&dnp->dn_spill, &czb);
361 		if (err) {
362 			if (!hard)
363 				return (err);
364 			lasterr = err;
365 		}
366 	}
367 	return (err != 0 ? err : lasterr);
368 }
369 
370 /* ARGSUSED */
371 static int
372 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
373     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
374     void *arg)
375 {
376 	prefetch_data_t *pfd = arg;
377 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
378 
379 	ASSERT(pfd->pd_blks_fetched >= 0);
380 	if (pfd->pd_cancel)
381 		return (EINTR);
382 
383 	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
384 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
385 	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
386 		return (0);
387 
388 	mutex_enter(&pfd->pd_mtx);
389 	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
390 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
391 	pfd->pd_blks_fetched++;
392 	cv_broadcast(&pfd->pd_cv);
393 	mutex_exit(&pfd->pd_mtx);
394 
395 	(void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
396 	    ZIO_PRIORITY_ASYNC_READ,
397 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
398 	    &aflags, zb);
399 
400 	return (0);
401 }
402 
403 static void
404 traverse_prefetch_thread(void *arg)
405 {
406 	traverse_data_t *td_main = arg;
407 	traverse_data_t td = *td_main;
408 	zbookmark_t czb;
409 
410 	td.td_func = traverse_prefetcher;
411 	td.td_arg = td_main->td_pfd;
412 	td.td_pfd = NULL;
413 
414 	SET_BOOKMARK(&czb, td.td_objset,
415 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
416 	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
417 
418 	mutex_enter(&td_main->td_pfd->pd_mtx);
419 	td_main->td_pfd->pd_exited = B_TRUE;
420 	cv_broadcast(&td_main->td_pfd->pd_cv);
421 	mutex_exit(&td_main->td_pfd->pd_mtx);
422 }
423 
424 /*
425  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
426  * in syncing context).
427  */
428 static int
429 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
430     uint64_t txg_start, zbookmark_t *resume, int flags,
431     blkptr_cb_t func, void *arg)
432 {
433 	traverse_data_t td;
434 	prefetch_data_t pd = { 0 };
435 	zbookmark_t czb;
436 	int err;
437 
438 	ASSERT(ds == NULL || objset == ds->ds_object);
439 	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
440 
441 	td.td_spa = spa;
442 	td.td_objset = objset;
443 	td.td_rootbp = rootbp;
444 	td.td_min_txg = txg_start;
445 	td.td_resume = resume;
446 	td.td_func = func;
447 	td.td_arg = arg;
448 	td.td_pfd = &pd;
449 	td.td_flags = flags;
450 
451 	pd.pd_blks_max = zfs_pd_blks_max;
452 	pd.pd_flags = flags;
453 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
454 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
455 
456 	/* See comment on ZIL traversal in dsl_scan_visitds. */
457 	if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
458 		objset_t *os;
459 
460 		err = dmu_objset_from_ds(ds, &os);
461 		if (err)
462 			return (err);
463 
464 		traverse_zil(&td, &os->os_zil_header);
465 	}
466 
467 	if (!(flags & TRAVERSE_PREFETCH) ||
468 	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
469 	    &td, TQ_NOQUEUE))
470 		pd.pd_exited = B_TRUE;
471 
472 	SET_BOOKMARK(&czb, td.td_objset,
473 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
474 	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
475 
476 	mutex_enter(&pd.pd_mtx);
477 	pd.pd_cancel = B_TRUE;
478 	cv_broadcast(&pd.pd_cv);
479 	while (!pd.pd_exited)
480 		cv_wait(&pd.pd_cv, &pd.pd_mtx);
481 	mutex_exit(&pd.pd_mtx);
482 
483 	mutex_destroy(&pd.pd_mtx);
484 	cv_destroy(&pd.pd_cv);
485 
486 	return (err);
487 }
488 
489 /*
490  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
491  * in syncing context).
492  */
493 int
494 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
495     blkptr_cb_t func, void *arg)
496 {
497 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
498 	    &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
499 }
500 
501 int
502 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
503     uint64_t txg_start, zbookmark_t *resume, int flags,
504     blkptr_cb_t func, void *arg)
505 {
506 	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
507 	    blkptr, txg_start, resume, flags, func, arg));
508 }
509 
510 /*
511  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
512  */
513 int
514 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
515     blkptr_cb_t func, void *arg)
516 {
517 	int err, lasterr = 0;
518 	uint64_t obj;
519 	dsl_pool_t *dp = spa_get_dsl(spa);
520 	objset_t *mos = dp->dp_meta_objset;
521 	boolean_t hard = (flags & TRAVERSE_HARD);
522 
523 	/* visit the MOS */
524 	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
525 	    txg_start, NULL, flags, func, arg);
526 	if (err)
527 		return (err);
528 
529 	/* visit each dataset */
530 	for (obj = 1; err == 0 || (err != ESRCH && hard);
531 	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
532 		dmu_object_info_t doi;
533 
534 		err = dmu_object_info(mos, obj, &doi);
535 		if (err) {
536 			if (!hard)
537 				return (err);
538 			lasterr = err;
539 			continue;
540 		}
541 
542 		if (doi.doi_type == DMU_OT_DSL_DATASET) {
543 			dsl_dataset_t *ds;
544 			uint64_t txg = txg_start;
545 
546 			rw_enter(&dp->dp_config_rwlock, RW_READER);
547 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
548 			rw_exit(&dp->dp_config_rwlock);
549 			if (err) {
550 				if (!hard)
551 					return (err);
552 				lasterr = err;
553 				continue;
554 			}
555 			if (ds->ds_phys->ds_prev_snap_txg > txg)
556 				txg = ds->ds_phys->ds_prev_snap_txg;
557 			err = traverse_dataset(ds, txg, flags, func, arg);
558 			dsl_dataset_rele(ds, FTAG);
559 			if (err) {
560 				if (!hard)
561 					return (err);
562 				lasterr = err;
563 			}
564 		}
565 	}
566 	if (err == ESRCH)
567 		err = 0;
568 	return (err != 0 ? err : lasterr);
569 }
570