1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2016 Nexenta Systems, Inc.
27 * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
28 * Copyright (c) 2015, 2017, Intel Corporation.
29 * Copyright (c) 2020 Datto Inc.
30 * Copyright (c) 2020, The FreeBSD Foundation [1]
31 *
32 * [1] Portions of this software were developed by Allan Jude
33 * under sponsorship from the FreeBSD Foundation.
34 * Copyright (c) 2021 Allan Jude
35 * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
36 * Copyright (c) 2023, Klara Inc.
37 * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
38 */
39
40 #include <stdio.h>
41 #include <unistd.h>
42 #include <stdlib.h>
43 #include <ctype.h>
44 #include <getopt.h>
45 #include <openssl/evp.h>
46 #include <sys/zfs_context.h>
47 #include <sys/spa.h>
48 #include <sys/spa_impl.h>
49 #include <sys/dmu.h>
50 #include <sys/zap.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/zfs_znode.h>
53 #include <sys/zfs_sa.h>
54 #include <sys/sa.h>
55 #include <sys/sa_impl.h>
56 #include <sys/vdev.h>
57 #include <sys/vdev_impl.h>
58 #include <sys/metaslab_impl.h>
59 #include <sys/dmu_objset.h>
60 #include <sys/dsl_dir.h>
61 #include <sys/dsl_dataset.h>
62 #include <sys/dsl_pool.h>
63 #include <sys/dsl_bookmark.h>
64 #include <sys/dbuf.h>
65 #include <sys/zil.h>
66 #include <sys/zil_impl.h>
67 #include <sys/stat.h>
68 #include <sys/resource.h>
69 #include <sys/dmu_send.h>
70 #include <sys/dmu_traverse.h>
71 #include <sys/zio_checksum.h>
72 #include <sys/zio_compress.h>
73 #include <sys/zfs_fuid.h>
74 #include <sys/arc.h>
75 #include <sys/arc_impl.h>
76 #include <sys/ddt.h>
77 #include <sys/ddt_impl.h>
78 #include <sys/zfeature.h>
79 #include <sys/abd.h>
80 #include <sys/blkptr.h>
81 #include <sys/dsl_crypt.h>
82 #include <sys/dsl_scan.h>
83 #include <sys/btree.h>
84 #include <sys/brt.h>
85 #include <sys/brt_impl.h>
86 #include <zfs_comutil.h>
87 #include <sys/zstd/zstd.h>
88 #include <sys/backtrace.h>
89
90 #include <libnvpair.h>
91 #include <libzutil.h>
92
93 #include <libzdb.h>
94
95 #include "zdb.h"
96
97
/*
 * Variables whose definitions live outside this file; they are only
 * declared here so that zdb can refer to them.
 */
extern int reference_tracking_enable;
extern int zfs_recover;
extern uint_t zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern boolean_t spa_mode_readable_spacemaps;
extern uint_t zfs_reconstruct_indirect_combinations_max;
extern uint_t zfs_btree_verify_intensity;

/* Program name, used as a prefix by usage() and fatal(). */
static const char cmdname[] = "zdb";
/* Per-option state, indexed by the option character (e.g. dump_opt['P']). */
uint8_t dump_opt[256];

/*
 * Signature shared by the per-object dump routines, e.g.
 * dump_packed_nvlist() and dump_history_offsets().
 */
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

/* NOTE(review): presumably the metaslab ids named on the command line. */
static uint64_t *zopt_metaslab = NULL;
static unsigned zopt_metaslab_args = 0;


/* NOTE(review): presumably the object ranges named on the command line. */
static zopt_object_range_t *zopt_object_ranges = NULL;
static unsigned zopt_object_args = 0;

/* NOTE(review): looks like a per-character flag lookup table -- confirm. */
static int flagbits[256];


static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
static int leaked_objects = 0;
static range_tree_t *mos_refd_objs;
static spa_t *spa;
static objset_t *os;
static boolean_t kernel_init_done;

/* Forward declarations of functions that are used before they are defined. */
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
    boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx);



static void zdb_print_blkptr(const blkptr_t *bp, int flags);
static void zdb_exit(int reason);
139
/*
 * Node stored in the sv_pair B-tree while a sublivelist is being verified.
 * Nodes are ordered by svbr_blk (see sublivelist_block_refcnt_compare()).
 */
typedef struct sublivelist_verify_block_refcnt {
	/* block pointer entry in livelist being verified */
	blkptr_t svbr_blk;

	/*
	 * Refcount gets incremented to 1 when we encounter the first
	 * FREE entry for the svbr_blk block pointer and a node for it
	 * is created in our ZDB verification/tracking metadata.
	 *
	 * As we encounter more FREE entries we increment this counter
	 * and similarly decrement it whenever we find the respective
	 * ALLOC entries for this block.
	 *
	 * When the refcount gets to 0 it means that all the FREE and
	 * ALLOC entries of this block have paired up and we no longer
	 * need to track it in our verification logic (e.g. the node
	 * containing this struct in our verification data structure
	 * should be freed).
	 *
	 * [refer to sublivelist_verify_blkptr() for the actual code]
	 */
	uint32_t svbr_refcnt;
} sublivelist_verify_block_refcnt_t;
163
164 static int
sublivelist_block_refcnt_compare(const void * larg,const void * rarg)165 sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
166 {
167 const sublivelist_verify_block_refcnt_t *l = larg;
168 const sublivelist_verify_block_refcnt_t *r = rarg;
169 return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
170 }
171
172 static int
sublivelist_verify_blkptr(void * arg,const blkptr_t * bp,boolean_t free,dmu_tx_t * tx)173 sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
174 dmu_tx_t *tx)
175 {
176 ASSERT3P(tx, ==, NULL);
177 struct sublivelist_verify *sv = arg;
178 sublivelist_verify_block_refcnt_t current = {
179 .svbr_blk = *bp,
180
181 /*
182 * Start with 1 in case this is the first free entry.
183 * This field is not used for our B-Tree comparisons
184 * anyway.
185 */
186 .svbr_refcnt = 1,
187 };
188
189 zfs_btree_index_t where;
190 sublivelist_verify_block_refcnt_t *pair =
191 zfs_btree_find(&sv->sv_pair, ¤t, &where);
192 if (free) {
193 if (pair == NULL) {
194 /* first free entry for this block pointer */
195 zfs_btree_add(&sv->sv_pair, ¤t);
196 } else {
197 pair->svbr_refcnt++;
198 }
199 } else {
200 if (pair == NULL) {
201 /* block that is currently marked as allocated */
202 for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
203 if (DVA_IS_EMPTY(&bp->blk_dva[i]))
204 break;
205 sublivelist_verify_block_t svb = {
206 .svb_dva = bp->blk_dva[i],
207 .svb_allocated_txg =
208 BP_GET_LOGICAL_BIRTH(bp)
209 };
210
211 if (zfs_btree_find(&sv->sv_leftover, &svb,
212 &where) == NULL) {
213 zfs_btree_add_idx(&sv->sv_leftover,
214 &svb, &where);
215 }
216 }
217 } else {
218 /* alloc matches a free entry */
219 pair->svbr_refcnt--;
220 if (pair->svbr_refcnt == 0) {
221 /* all allocs and frees have been matched */
222 zfs_btree_remove_idx(&sv->sv_pair, &where);
223 }
224 }
225 }
226
227 return (0);
228 }
229
230 static int
sublivelist_verify_func(void * args,dsl_deadlist_entry_t * dle)231 sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
232 {
233 int err;
234 struct sublivelist_verify *sv = args;
235
236 zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
237 sizeof (sublivelist_verify_block_refcnt_t));
238
239 err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
240 sv, NULL);
241
242 sublivelist_verify_block_refcnt_t *e;
243 zfs_btree_index_t *cookie = NULL;
244 while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
245 char blkbuf[BP_SPRINTF_LEN];
246 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
247 &e->svbr_blk, B_TRUE);
248 (void) printf("\tERROR: %d unmatched FREE(s): %s\n",
249 e->svbr_refcnt, blkbuf);
250 }
251 zfs_btree_destroy(&sv->sv_pair);
252
253 return (err);
254 }
255
256 static int
livelist_block_compare(const void * larg,const void * rarg)257 livelist_block_compare(const void *larg, const void *rarg)
258 {
259 const sublivelist_verify_block_t *l = larg;
260 const sublivelist_verify_block_t *r = rarg;
261
262 if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
263 return (-1);
264 else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
265 return (+1);
266
267 if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
268 return (-1);
269 else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
270 return (+1);
271
272 if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
273 return (-1);
274 else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
275 return (+1);
276
277 return (0);
278 }
279
280 /*
281 * Check for errors in a livelist while tracking all unfreed ALLOCs in the
282 * sublivelist_verify_t: sv->sv_leftover
283 */
284 static void
livelist_verify(dsl_deadlist_t * dl,void * arg)285 livelist_verify(dsl_deadlist_t *dl, void *arg)
286 {
287 sublivelist_verify_t *sv = arg;
288 dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
289 }
290
291 /*
292 * Check for errors in the livelist entry and discard the intermediary
293 * data structures
294 */
295 static int
sublivelist_verify_lightweight(void * args,dsl_deadlist_entry_t * dle)296 sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
297 {
298 (void) args;
299 sublivelist_verify_t sv;
300 zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
301 sizeof (sublivelist_verify_block_t));
302 int err = sublivelist_verify_func(&sv, dle);
303 zfs_btree_clear(&sv.sv_leftover);
304 zfs_btree_destroy(&sv.sv_leftover);
305 return (err);
306 }
307
/*
 * Per-metaslab state used by livelist_metaslab_validate() while it
 * replays a metaslab's space map and its space map log entries.
 */
typedef struct metaslab_verify {
	/*
	 * Tree containing all the leftover ALLOCs from the livelists
	 * that are part of this metaslab.
	 */
	zfs_btree_t mv_livelist_allocs;

	/*
	 * Metaslab information: the id of its vdev, its own id, and the
	 * offset range it covers. mv_end is ms_start + ms_size, i.e. the
	 * first offset past the metaslab.
	 */
	uint64_t mv_vdid;
	uint64_t mv_msid;
	uint64_t mv_start;
	uint64_t mv_end;

	/*
	 * What's currently allocated for this metaslab.
	 */
	range_tree_t *mv_allocated;
} metaslab_verify_t;
328
/* Callback applied to a livelist; livelist_verify() has this signature. */
typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);

/*
 * Callback invoked by iterate_through_spacemap_logs() for every entry of
 * every log space map; 'txg' is the TXG of the log the entry came from.
 */
typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
    void *arg);

/*
 * Context handed to iterate_through_spacemap_logs_cb() so that it can
 * forward each space map entry to the user's zdb_log_sm_cb_t.
 */
typedef struct unflushed_iter_cb_arg {
	spa_t *uic_spa;		/* pool being examined */
	uint64_t uic_txg;	/* TXG of the log currently iterated */
	void *uic_arg;		/* opaque argument for uic_cb */
	zdb_log_sm_cb_t uic_cb;	/* user callback */
} unflushed_iter_cb_arg_t;
340
341 static int
iterate_through_spacemap_logs_cb(space_map_entry_t * sme,void * arg)342 iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
343 {
344 unflushed_iter_cb_arg_t *uic = arg;
345 return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
346 }
347
348 static void
iterate_through_spacemap_logs(spa_t * spa,zdb_log_sm_cb_t cb,void * arg)349 iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
350 {
351 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
352 return;
353
354 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
355 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
356 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
357 space_map_t *sm = NULL;
358 VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
359 sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
360
361 unflushed_iter_cb_arg_t uic = {
362 .uic_spa = spa,
363 .uic_txg = sls->sls_txg,
364 .uic_arg = arg,
365 .uic_cb = cb
366 };
367 VERIFY0(space_map_iterate(sm, space_map_length(sm),
368 iterate_through_spacemap_logs_cb, &uic));
369 space_map_close(sm);
370 }
371 spa_config_exit(spa, SCL_CONFIG, FTAG);
372 }
373
374 static void
verify_livelist_allocs(metaslab_verify_t * mv,uint64_t txg,uint64_t offset,uint64_t size)375 verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
376 uint64_t offset, uint64_t size)
377 {
378 sublivelist_verify_block_t svb = {{{0}}};
379 DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
380 DVA_SET_OFFSET(&svb.svb_dva, offset);
381 DVA_SET_ASIZE(&svb.svb_dva, size);
382 zfs_btree_index_t where;
383 uint64_t end_offset = offset + size;
384
385 /*
386 * Look for an exact match for spacemap entry in the livelist entries.
387 * Then, look for other livelist entries that fall within the range
388 * of the spacemap entry as it may have been condensed
389 */
390 sublivelist_verify_block_t *found =
391 zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
392 if (found == NULL) {
393 found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
394 }
395 for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
396 DVA_GET_OFFSET(&found->svb_dva) < end_offset;
397 found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
398 if (found->svb_allocated_txg <= txg) {
399 (void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
400 "from TXG %llx FREED at TXG %llx\n",
401 (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
402 (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
403 (u_longlong_t)found->svb_allocated_txg,
404 (u_longlong_t)txg);
405 }
406 }
407 }
408
409 static int
metaslab_spacemap_validation_cb(space_map_entry_t * sme,void * arg)410 metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
411 {
412 metaslab_verify_t *mv = arg;
413 uint64_t offset = sme->sme_offset;
414 uint64_t size = sme->sme_run;
415 uint64_t txg = sme->sme_txg;
416
417 if (sme->sme_type == SM_ALLOC) {
418 if (range_tree_contains(mv->mv_allocated,
419 offset, size)) {
420 (void) printf("ERROR: DOUBLE ALLOC: "
421 "%llu [%llx:%llx] "
422 "%llu:%llu LOG_SM\n",
423 (u_longlong_t)txg, (u_longlong_t)offset,
424 (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
425 (u_longlong_t)mv->mv_msid);
426 } else {
427 range_tree_add(mv->mv_allocated,
428 offset, size);
429 }
430 } else {
431 if (!range_tree_contains(mv->mv_allocated,
432 offset, size)) {
433 (void) printf("ERROR: DOUBLE FREE: "
434 "%llu [%llx:%llx] "
435 "%llu:%llu LOG_SM\n",
436 (u_longlong_t)txg, (u_longlong_t)offset,
437 (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
438 (u_longlong_t)mv->mv_msid);
439 } else {
440 range_tree_remove(mv->mv_allocated,
441 offset, size);
442 }
443 }
444
445 if (sme->sme_type != SM_ALLOC) {
446 /*
447 * If something is freed in the spacemap, verify that
448 * it is not listed as allocated in the livelist.
449 */
450 verify_livelist_allocs(mv, txg, offset, size);
451 }
452 return (0);
453 }
454
455 static int
spacemap_check_sm_log_cb(spa_t * spa,space_map_entry_t * sme,uint64_t txg,void * arg)456 spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
457 uint64_t txg, void *arg)
458 {
459 metaslab_verify_t *mv = arg;
460 uint64_t offset = sme->sme_offset;
461 uint64_t vdev_id = sme->sme_vdev;
462
463 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
464
465 /* skip indirect vdevs */
466 if (!vdev_is_concrete(vd))
467 return (0);
468
469 if (vdev_id != mv->mv_vdid)
470 return (0);
471
472 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
473 if (ms->ms_id != mv->mv_msid)
474 return (0);
475
476 if (txg < metaslab_unflushed_txg(ms))
477 return (0);
478
479
480 ASSERT3U(txg, ==, sme->sme_txg);
481 return (metaslab_spacemap_validation_cb(sme, mv));
482 }
483
484 static void
spacemap_check_sm_log(spa_t * spa,metaslab_verify_t * mv)485 spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
486 {
487 iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
488 }
489
490 static void
spacemap_check_ms_sm(space_map_t * sm,metaslab_verify_t * mv)491 spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
492 {
493 if (sm == NULL)
494 return;
495
496 VERIFY0(space_map_iterate(sm, space_map_length(sm),
497 metaslab_spacemap_validation_cb, mv));
498 }
499
/* Forward declaration; used by livelist_metaslab_validate() below. */
static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
501
502 /*
503 * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
504 * they are part of that metaslab (mv_msid).
505 */
506 static void
mv_populate_livelist_allocs(metaslab_verify_t * mv,sublivelist_verify_t * sv)507 mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
508 {
509 zfs_btree_index_t where;
510 sublivelist_verify_block_t *svb;
511 ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
512 for (svb = zfs_btree_first(&sv->sv_leftover, &where);
513 svb != NULL;
514 svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
515 if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
516 continue;
517
518 if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
519 (DVA_GET_OFFSET(&svb->svb_dva) +
520 DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
521 (void) printf("ERROR: Found block that crosses "
522 "metaslab boundary: <%llu:%llx:%llx>\n",
523 (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
524 (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
525 (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
526 continue;
527 }
528
529 if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
530 continue;
531
532 if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
533 continue;
534
535 if ((DVA_GET_OFFSET(&svb->svb_dva) +
536 DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
537 (void) printf("ERROR: Found block that crosses "
538 "metaslab boundary: <%llu:%llx:%llx>\n",
539 (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
540 (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
541 (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
542 continue;
543 }
544
545 zfs_btree_add(&mv->mv_livelist_allocs, svb);
546 }
547
548 for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
549 svb != NULL;
550 svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
551 zfs_btree_remove(&sv->sv_leftover, svb);
552 }
553 }
554
555 /*
556 * [Livelist Check]
557 * Iterate through all the sublivelists and:
558 * - report leftover frees (**)
559 * - record leftover ALLOCs together with their TXG [see Cross Check]
560 *
561 * (**) Note: Double ALLOCs are valid in datasets that have dedup
562 * enabled. Similarly double FREEs are allowed as well but
563 * only if they pair up with a corresponding ALLOC entry once
564 * we our done with our sublivelist iteration.
565 *
566 * [Spacemap Check]
567 * for each metaslab:
568 * - iterate over spacemap and then the metaslab's entries in the
569 * spacemap log, then report any double FREEs and ALLOCs (do not
570 * blow up).
571 *
572 * [Cross Check]
573 * After finishing the Livelist Check phase and while being in the
574 * Spacemap Check phase, we find all the recorded leftover ALLOCs
575 * of the livelist check that are part of the metaslab that we are
576 * currently looking at in the Spacemap Check. We report any entries
577 * that are marked as ALLOCs in the livelists but have been actually
578 * freed (and potentially allocated again) after their TXG stamp in
579 * the spacemaps. Also report any ALLOCs from the livelists that
580 * belong to indirect vdevs (e.g. their vdev completed removal).
581 *
582 * Note that this will miss Log Spacemap entries that cancelled each other
583 * out before being flushed to the metaslab, so we are not guaranteed
584 * to match all erroneous ALLOCs.
585 */
586 static void
livelist_metaslab_validate(spa_t * spa)587 livelist_metaslab_validate(spa_t *spa)
588 {
589 (void) printf("Verifying deleted livelist entries\n");
590
591 sublivelist_verify_t sv;
592 zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
593 sizeof (sublivelist_verify_block_t));
594 iterate_deleted_livelists(spa, livelist_verify, &sv);
595
596 (void) printf("Verifying metaslab entries\n");
597 vdev_t *rvd = spa->spa_root_vdev;
598 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
599 vdev_t *vd = rvd->vdev_child[c];
600
601 if (!vdev_is_concrete(vd))
602 continue;
603
604 for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
605 metaslab_t *m = vd->vdev_ms[mid];
606
607 (void) fprintf(stderr,
608 "\rverifying concrete vdev %llu, "
609 "metaslab %llu of %llu ...",
610 (longlong_t)vd->vdev_id,
611 (longlong_t)mid,
612 (longlong_t)vd->vdev_ms_count);
613
614 uint64_t shift, start;
615 range_seg_type_t type =
616 metaslab_calculate_range_tree_type(vd, m,
617 &start, &shift);
618 metaslab_verify_t mv;
619 mv.mv_allocated = range_tree_create(NULL,
620 type, NULL, start, shift);
621 mv.mv_vdid = vd->vdev_id;
622 mv.mv_msid = m->ms_id;
623 mv.mv_start = m->ms_start;
624 mv.mv_end = m->ms_start + m->ms_size;
625 zfs_btree_create(&mv.mv_livelist_allocs,
626 livelist_block_compare, NULL,
627 sizeof (sublivelist_verify_block_t));
628
629 mv_populate_livelist_allocs(&mv, &sv);
630
631 spacemap_check_ms_sm(m->ms_sm, &mv);
632 spacemap_check_sm_log(spa, &mv);
633
634 range_tree_vacate(mv.mv_allocated, NULL, NULL);
635 range_tree_destroy(mv.mv_allocated);
636 zfs_btree_clear(&mv.mv_livelist_allocs);
637 zfs_btree_destroy(&mv.mv_livelist_allocs);
638 }
639 }
640 (void) fprintf(stderr, "\n");
641
642 /*
643 * If there are any segments in the leftover tree after we walked
644 * through all the metaslabs in the concrete vdevs then this means
645 * that we have segments in the livelists that belong to indirect
646 * vdevs and are marked as allocated.
647 */
648 if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
649 zfs_btree_destroy(&sv.sv_leftover);
650 return;
651 }
652 (void) printf("ERROR: Found livelist blocks marked as allocated "
653 "for indirect vdevs:\n");
654
655 zfs_btree_index_t *where = NULL;
656 sublivelist_verify_block_t *svb;
657 while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
658 NULL) {
659 int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
660 ASSERT3U(vdev_id, <, rvd->vdev_children);
661 vdev_t *vd = rvd->vdev_child[vdev_id];
662 ASSERT(!vdev_is_concrete(vd));
663 (void) printf("<%d:%llx:%llx> TXG %llx\n",
664 vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
665 (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
666 (u_longlong_t)svb->svb_allocated_txg);
667 }
668 (void) printf("\n");
669 zfs_btree_destroy(&sv.sv_leftover);
670 }
671
/*
 * These libumem hooks provide a reasonable set of defaults for the
 * allocator's debugging facilities.
 */

/* Returns the default value for the $UMEM_DEBUG setting. */
const char *
_umem_debug_init(void)
{
	static const char umem_debug_default[] = "default,verbose";

	return (umem_debug_default);
}
681
/* libumem hook: returns the default value for the $UMEM_LOGGING setting. */
const char *
_umem_logging_init(void)
{
	static const char umem_logging_default[] = "fail,contents";

	return (umem_logging_default);
}
687
688 static void
usage(void)689 usage(void)
690 {
691 (void) fprintf(stderr,
692 "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
693 "[-I <inflight I/Os>]\n"
694 "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
695 "\t\t[-K <key>]\n"
696 "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
697 "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
698 "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"
699 "\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
700 "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
701 "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
702 "\t%s [-v] <bookmark>\n"
703 "\t%s -C [-A] [-U <cache>] [<poolname>]\n"
704 "\t%s -l [-Aqu] <device>\n"
705 "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
706 "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
707 "\t%s -O [-K <key>] <dataset> <path>\n"
708 "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
709 "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
710 "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
711 "\t%s -E [-A] word0:word1:...:word15\n"
712 "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
713 "<poolname>\n\n",
714 cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
715 cmdname, cmdname, cmdname, cmdname, cmdname);
716
717 (void) fprintf(stderr, " Dataset name must include at least one "
718 "separator character '/' or '@'\n");
719 (void) fprintf(stderr, " If dataset name is specified, only that "
720 "dataset is dumped\n");
721 (void) fprintf(stderr, " If object numbers or object number "
722 "ranges are specified, only those\n"
723 " objects or ranges are dumped.\n\n");
724 (void) fprintf(stderr,
725 " Object ranges take the form <start>:<end>[:<flags>]\n"
726 " start Starting object number\n"
727 " end Ending object number, or -1 for no upper bound\n"
728 " flags Optional flags to select object types:\n"
729 " A All objects (this is the default)\n"
730 " d ZFS directories\n"
731 " f ZFS files \n"
732 " m SPA space maps\n"
733 " z ZAPs\n"
734 " - Negate effect of next flag\n\n");
735 (void) fprintf(stderr, " Options to control amount of output:\n");
736 (void) fprintf(stderr, " -b --block-stats "
737 "block statistics\n");
738 (void) fprintf(stderr, " -B --backup "
739 "backup stream\n");
740 (void) fprintf(stderr, " -c --checksum "
741 "checksum all metadata (twice for all data) blocks\n");
742 (void) fprintf(stderr, " -C --config "
743 "config (or cachefile if alone)\n");
744 (void) fprintf(stderr, " -d --datasets "
745 "dataset(s)\n");
746 (void) fprintf(stderr, " -D --dedup-stats "
747 "dedup statistics\n");
748 (void) fprintf(stderr, " -E --embedded-block-pointer=INTEGER\n"
749 " decode and display block "
750 "from an embedded block pointer\n");
751 (void) fprintf(stderr, " -h --history "
752 "pool history\n");
753 (void) fprintf(stderr, " -i --intent-logs "
754 "intent logs\n");
755 (void) fprintf(stderr, " -l --label "
756 "read label contents\n");
757 (void) fprintf(stderr, " -k --checkpointed-state "
758 "examine the checkpointed state of the pool\n");
759 (void) fprintf(stderr, " -L --disable-leak-tracking "
760 "disable leak tracking (do not load spacemaps)\n");
761 (void) fprintf(stderr, " -m --metaslabs "
762 "metaslabs\n");
763 (void) fprintf(stderr, " -M --metaslab-groups "
764 "metaslab groups\n");
765 (void) fprintf(stderr, " -O --object-lookups "
766 "perform object lookups by path\n");
767 (void) fprintf(stderr, " -r --copy-object "
768 "copy an object by path to file\n");
769 (void) fprintf(stderr, " -R --read-block "
770 "read and display block from a device\n");
771 (void) fprintf(stderr, " -s --io-stats "
772 "report stats on zdb's I/O\n");
773 (void) fprintf(stderr, " -S --simulate-dedup "
774 "simulate dedup to measure effect\n");
775 (void) fprintf(stderr, " -v --verbose "
776 "verbose (applies to all others)\n");
777 (void) fprintf(stderr, " -y --livelist "
778 "perform livelist and metaslab validation on any livelists being "
779 "deleted\n\n");
780 (void) fprintf(stderr, " Below options are intended for use "
781 "with other options:\n");
782 (void) fprintf(stderr, " -A --ignore-assertions "
783 "ignore assertions (-A), enable panic recovery (-AA) or both "
784 "(-AAA)\n");
785 (void) fprintf(stderr, " -e --exported "
786 "pool is exported/destroyed/has altroot/not in a cachefile\n");
787 (void) fprintf(stderr, " -F --automatic-rewind "
788 "attempt automatic rewind within safe range of transaction "
789 "groups\n");
790 (void) fprintf(stderr, " -G --dump-debug-msg "
791 "dump zfs_dbgmsg buffer before exiting\n");
792 (void) fprintf(stderr, " -I --inflight=INTEGER "
793 "specify the maximum number of checksumming I/Os "
794 "[default is 200]\n");
795 (void) fprintf(stderr, " -K --key=KEY "
796 "decryption key for encrypted dataset\n");
797 (void) fprintf(stderr, " -o --option=\"OPTION=INTEGER\" "
798 "set global variable to an unsigned 32-bit integer\n");
799 (void) fprintf(stderr, " -p --path==PATH "
800 "use one or more with -e to specify path to vdev dir\n");
801 (void) fprintf(stderr, " -P --parseable "
802 "print numbers in parseable form\n");
803 (void) fprintf(stderr, " -q --skip-label "
804 "don't print label contents\n");
805 (void) fprintf(stderr, " -t --txg=INTEGER "
806 "highest txg to use when searching for uberblocks\n");
807 (void) fprintf(stderr, " -T --brt-stats "
808 "BRT statistics\n");
809 (void) fprintf(stderr, " -u --uberblock "
810 "uberblock\n");
811 (void) fprintf(stderr, " -U --cachefile=PATH "
812 "use alternate cachefile\n");
813 (void) fprintf(stderr, " -V --verbatim "
814 "do verbatim import\n");
815 (void) fprintf(stderr, " -x --dump-blocks=PATH "
816 "dump all read blocks into specified directory\n");
817 (void) fprintf(stderr, " -X --extreme-rewind "
818 "attempt extreme rewind (does not work with dataset)\n");
819 (void) fprintf(stderr, " -Y --all-reconstruction "
820 "attempt all reconstruction combinations for split blocks\n");
821 (void) fprintf(stderr, " -Z --zstd-headers "
822 "show ZSTD headers \n");
823 (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
824 "to make only that option verbose\n");
825 (void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
826 zdb_exit(1);
827 }
828
829 static void
dump_debug_buffer(void)830 dump_debug_buffer(void)
831 {
832 ssize_t ret __attribute__((unused));
833
834 if (!dump_opt['G'])
835 return;
836 /*
837 * We use write() instead of printf() so that this function
838 * is safe to call from a signal handler.
839 */
840 ret = write(STDERR_FILENO, "\n", 1);
841 zfs_dbgmsg_print(STDERR_FILENO, "zdb");
842 }
843
sig_handler(int signo)844 static void sig_handler(int signo)
845 {
846 struct sigaction action;
847
848 libspl_backtrace(STDERR_FILENO);
849 dump_debug_buffer();
850
851 /*
852 * Restore default action and re-raise signal so SIGSEGV and
853 * SIGABRT can trigger a core dump.
854 */
855 action.sa_handler = SIG_DFL;
856 sigemptyset(&action.sa_mask);
857 action.sa_flags = 0;
858 (void) sigaction(signo, &action, NULL);
859 raise(signo);
860 }
861
862 /*
863 * Called for usage errors that are discovered after a call to spa_open(),
864 * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
865 */
866
867 static void
fatal(const char * fmt,...)868 fatal(const char *fmt, ...)
869 {
870 va_list ap;
871
872 va_start(ap, fmt);
873 (void) fprintf(stderr, "%s: ", cmdname);
874 (void) vfprintf(stderr, fmt, ap);
875 va_end(ap);
876 (void) fprintf(stderr, "\n");
877
878 dump_debug_buffer();
879
880 zdb_exit(1);
881 }
882
883 static void
dump_packed_nvlist(objset_t * os,uint64_t object,void * data,size_t size)884 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
885 {
886 (void) size;
887 nvlist_t *nv;
888 size_t nvsize = *(uint64_t *)data;
889 char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
890
891 VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
892
893 VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
894
895 umem_free(packed, nvsize);
896
897 dump_nvlist(nv, 8);
898
899 nvlist_free(nv);
900 }
901
902 static void
dump_history_offsets(objset_t * os,uint64_t object,void * data,size_t size)903 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
904 {
905 (void) os, (void) object, (void) size;
906 spa_history_phys_t *shp = data;
907
908 if (shp == NULL)
909 return;
910
911 (void) printf("\t\tpool_create_len = %llu\n",
912 (u_longlong_t)shp->sh_pool_create_len);
913 (void) printf("\t\tphys_max_off = %llu\n",
914 (u_longlong_t)shp->sh_phys_max_off);
915 (void) printf("\t\tbof = %llu\n",
916 (u_longlong_t)shp->sh_bof);
917 (void) printf("\t\teof = %llu\n",
918 (u_longlong_t)shp->sh_eof);
919 (void) printf("\t\trecords_lost = %llu\n",
920 (u_longlong_t)shp->sh_records_lost);
921 }
922
923 static void
zdb_nicenum(uint64_t num,char * buf,size_t buflen)924 zdb_nicenum(uint64_t num, char *buf, size_t buflen)
925 {
926 if (dump_opt['P'])
927 (void) snprintf(buf, buflen, "%llu", (longlong_t)num);
928 else
929 nicenum(num, buf, buflen);
930 }
931
932 static void
zdb_nicebytes(uint64_t bytes,char * buf,size_t buflen)933 zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
934 {
935 if (dump_opt['P'])
936 (void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);
937 else
938 zfs_nicebytes(bytes, buf, buflen);
939 }
940
941 static const char histo_stars[] = "****************************************";
942 static const uint64_t histo_width = sizeof (histo_stars) - 1;
943
944 static void
dump_histogram(const uint64_t * histo,int size,int offset)945 dump_histogram(const uint64_t *histo, int size, int offset)
946 {
947 int i;
948 int minidx = size - 1;
949 int maxidx = 0;
950 uint64_t max = 0;
951
952 for (i = 0; i < size; i++) {
953 if (histo[i] == 0)
954 continue;
955 if (histo[i] > max)
956 max = histo[i];
957 if (i > maxidx)
958 maxidx = i;
959 if (i < minidx)
960 minidx = i;
961 }
962
963 if (max < histo_width)
964 max = histo_width;
965
966 for (i = minidx; i <= maxidx; i++) {
967 (void) printf("\t\t\t%3u: %6llu %s\n",
968 i + offset, (u_longlong_t)histo[i],
969 &histo_stars[(max - histo[i]) * histo_width / max]);
970 }
971 }
972
973 static void
dump_zap_stats(objset_t * os,uint64_t object)974 dump_zap_stats(objset_t *os, uint64_t object)
975 {
976 int error;
977 zap_stats_t zs;
978
979 error = zap_get_stats(os, object, &zs);
980 if (error)
981 return;
982
983 if (zs.zs_ptrtbl_len == 0) {
984 ASSERT(zs.zs_num_blocks == 1);
985 (void) printf("\tmicrozap: %llu bytes, %llu entries\n",
986 (u_longlong_t)zs.zs_blocksize,
987 (u_longlong_t)zs.zs_num_entries);
988 return;
989 }
990
991 (void) printf("\tFat ZAP stats:\n");
992
993 (void) printf("\t\tPointer table:\n");
994 (void) printf("\t\t\t%llu elements\n",
995 (u_longlong_t)zs.zs_ptrtbl_len);
996 (void) printf("\t\t\tzt_blk: %llu\n",
997 (u_longlong_t)zs.zs_ptrtbl_zt_blk);
998 (void) printf("\t\t\tzt_numblks: %llu\n",
999 (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
1000 (void) printf("\t\t\tzt_shift: %llu\n",
1001 (u_longlong_t)zs.zs_ptrtbl_zt_shift);
1002 (void) printf("\t\t\tzt_blks_copied: %llu\n",
1003 (u_longlong_t)zs.zs_ptrtbl_blks_copied);
1004 (void) printf("\t\t\tzt_nextblk: %llu\n",
1005 (u_longlong_t)zs.zs_ptrtbl_nextblk);
1006
1007 (void) printf("\t\tZAP entries: %llu\n",
1008 (u_longlong_t)zs.zs_num_entries);
1009 (void) printf("\t\tLeaf blocks: %llu\n",
1010 (u_longlong_t)zs.zs_num_leafs);
1011 (void) printf("\t\tTotal blocks: %llu\n",
1012 (u_longlong_t)zs.zs_num_blocks);
1013 (void) printf("\t\tzap_block_type: 0x%llx\n",
1014 (u_longlong_t)zs.zs_block_type);
1015 (void) printf("\t\tzap_magic: 0x%llx\n",
1016 (u_longlong_t)zs.zs_magic);
1017 (void) printf("\t\tzap_salt: 0x%llx\n",
1018 (u_longlong_t)zs.zs_salt);
1019
1020 (void) printf("\t\tLeafs with 2^n pointers:\n");
1021 dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
1022
1023 (void) printf("\t\tBlocks with n*5 entries:\n");
1024 dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
1025
1026 (void) printf("\t\tBlocks n/10 full:\n");
1027 dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
1028
1029 (void) printf("\t\tEntries with n chunks:\n");
1030 dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
1031
1032 (void) printf("\t\tBuckets with n entries:\n");
1033 dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
1034 }
1035
1036 static void
dump_none(objset_t * os,uint64_t object,void * data,size_t size)1037 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
1038 {
1039 (void) os, (void) object, (void) data, (void) size;
1040 }
1041
1042 static void
dump_unknown(objset_t * os,uint64_t object,void * data,size_t size)1043 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
1044 {
1045 (void) os, (void) object, (void) data, (void) size;
1046 (void) printf("\tUNKNOWN OBJECT TYPE\n");
1047 }
1048
1049 static void
dump_uint8(objset_t * os,uint64_t object,void * data,size_t size)1050 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
1051 {
1052 (void) os, (void) object, (void) data, (void) size;
1053 }
1054
1055 static void
dump_uint64(objset_t * os,uint64_t object,void * data,size_t size)1056 dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
1057 {
1058 uint64_t *arr;
1059 uint64_t oursize;
1060 if (dump_opt['d'] < 6)
1061 return;
1062
1063 if (data == NULL) {
1064 dmu_object_info_t doi;
1065
1066 VERIFY0(dmu_object_info(os, object, &doi));
1067 size = doi.doi_max_offset;
1068 /*
1069 * We cap the size at 1 mebibyte here to prevent
1070 * allocation failures and nigh-infinite printing if the
1071 * object is extremely large.
1072 */
1073 oursize = MIN(size, 1 << 20);
1074 arr = kmem_alloc(oursize, KM_SLEEP);
1075
1076 int err = dmu_read(os, object, 0, oursize, arr, 0);
1077 if (err != 0) {
1078 (void) printf("got error %u from dmu_read\n", err);
1079 kmem_free(arr, oursize);
1080 return;
1081 }
1082 } else {
1083 /*
1084 * Even though the allocation is already done in this code path,
1085 * we still cap the size to prevent excessive printing.
1086 */
1087 oursize = MIN(size, 1 << 20);
1088 arr = data;
1089 }
1090
1091 if (size == 0) {
1092 if (data == NULL)
1093 kmem_free(arr, oursize);
1094 (void) printf("\t\t[]\n");
1095 return;
1096 }
1097
1098 (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
1099 for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
1100 if (i % 4 != 0)
1101 (void) printf(", %0llx", (u_longlong_t)arr[i]);
1102 else
1103 (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
1104 }
1105 if (oursize != size)
1106 (void) printf(", ... ");
1107 (void) printf("]\n");
1108
1109 if (data == NULL)
1110 kmem_free(arr, oursize);
1111 }
1112
1113 static void
dump_zap(objset_t * os,uint64_t object,void * data,size_t size)1114 dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
1115 {
1116 (void) data, (void) size;
1117 zap_cursor_t zc;
1118 zap_attribute_t attr;
1119 void *prop;
1120 unsigned i;
1121
1122 dump_zap_stats(os, object);
1123 (void) printf("\n");
1124
1125 for (zap_cursor_init(&zc, os, object);
1126 zap_cursor_retrieve(&zc, &attr) == 0;
1127 zap_cursor_advance(&zc)) {
1128 (void) printf("\t\t%s = ", attr.za_name);
1129 if (attr.za_num_integers == 0) {
1130 (void) printf("\n");
1131 continue;
1132 }
1133 prop = umem_zalloc(attr.za_num_integers *
1134 attr.za_integer_length, UMEM_NOFAIL);
1135 (void) zap_lookup(os, object, attr.za_name,
1136 attr.za_integer_length, attr.za_num_integers, prop);
1137 if (attr.za_integer_length == 1) {
1138 if (strcmp(attr.za_name,
1139 DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
1140 strcmp(attr.za_name,
1141 DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
1142 strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 ||
1143 strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
1144 strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) {
1145 uint8_t *u8 = prop;
1146
1147 for (i = 0; i < attr.za_num_integers; i++) {
1148 (void) printf("%02x", u8[i]);
1149 }
1150 } else {
1151 (void) printf("%s", (char *)prop);
1152 }
1153 } else {
1154 for (i = 0; i < attr.za_num_integers; i++) {
1155 switch (attr.za_integer_length) {
1156 case 2:
1157 (void) printf("%u ",
1158 ((uint16_t *)prop)[i]);
1159 break;
1160 case 4:
1161 (void) printf("%u ",
1162 ((uint32_t *)prop)[i]);
1163 break;
1164 case 8:
1165 (void) printf("%lld ",
1166 (u_longlong_t)((int64_t *)prop)[i]);
1167 break;
1168 }
1169 }
1170 }
1171 (void) printf("\n");
1172 umem_free(prop, attr.za_num_integers * attr.za_integer_length);
1173 }
1174 zap_cursor_fini(&zc);
1175 }
1176
1177 static void
dump_bpobj(objset_t * os,uint64_t object,void * data,size_t size)1178 dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
1179 {
1180 bpobj_phys_t *bpop = data;
1181 uint64_t i;
1182 char bytes[32], comp[32], uncomp[32];
1183
1184 /* make sure the output won't get truncated */
1185 _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
1186 _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
1187 _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
1188
1189 if (bpop == NULL)
1190 return;
1191
1192 zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
1193 zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
1194 zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
1195
1196 (void) printf("\t\tnum_blkptrs = %llu\n",
1197 (u_longlong_t)bpop->bpo_num_blkptrs);
1198 (void) printf("\t\tbytes = %s\n", bytes);
1199 if (size >= BPOBJ_SIZE_V1) {
1200 (void) printf("\t\tcomp = %s\n", comp);
1201 (void) printf("\t\tuncomp = %s\n", uncomp);
1202 }
1203 if (size >= BPOBJ_SIZE_V2) {
1204 (void) printf("\t\tsubobjs = %llu\n",
1205 (u_longlong_t)bpop->bpo_subobjs);
1206 (void) printf("\t\tnum_subobjs = %llu\n",
1207 (u_longlong_t)bpop->bpo_num_subobjs);
1208 }
1209 if (size >= sizeof (*bpop)) {
1210 (void) printf("\t\tnum_freed = %llu\n",
1211 (u_longlong_t)bpop->bpo_num_freed);
1212 }
1213
1214 if (dump_opt['d'] < 5)
1215 return;
1216
1217 for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
1218 char blkbuf[BP_SPRINTF_LEN];
1219 blkptr_t bp;
1220
1221 int err = dmu_read(os, object,
1222 i * sizeof (bp), sizeof (bp), &bp, 0);
1223 if (err != 0) {
1224 (void) printf("got error %u from dmu_read\n", err);
1225 break;
1226 }
1227 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
1228 BP_GET_FREE(&bp));
1229 (void) printf("\t%s\n", blkbuf);
1230 }
1231 }
1232
1233 static void
dump_bpobj_subobjs(objset_t * os,uint64_t object,void * data,size_t size)1234 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
1235 {
1236 (void) data, (void) size;
1237 dmu_object_info_t doi;
1238 int64_t i;
1239
1240 VERIFY0(dmu_object_info(os, object, &doi));
1241 uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
1242
1243 int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
1244 if (err != 0) {
1245 (void) printf("got error %u from dmu_read\n", err);
1246 kmem_free(subobjs, doi.doi_max_offset);
1247 return;
1248 }
1249
1250 int64_t last_nonzero = -1;
1251 for (i = 0; i < doi.doi_max_offset / 8; i++) {
1252 if (subobjs[i] != 0)
1253 last_nonzero = i;
1254 }
1255
1256 for (i = 0; i <= last_nonzero; i++) {
1257 (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
1258 }
1259 kmem_free(subobjs, doi.doi_max_offset);
1260 }
1261
1262 static void
dump_ddt_zap(objset_t * os,uint64_t object,void * data,size_t size)1263 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
1264 {
1265 (void) data, (void) size;
1266 dump_zap_stats(os, object);
1267 /* contents are printed elsewhere, properly decoded */
1268 }
1269
1270 static void
dump_sa_attrs(objset_t * os,uint64_t object,void * data,size_t size)1271 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
1272 {
1273 (void) data, (void) size;
1274 zap_cursor_t zc;
1275 zap_attribute_t attr;
1276
1277 dump_zap_stats(os, object);
1278 (void) printf("\n");
1279
1280 for (zap_cursor_init(&zc, os, object);
1281 zap_cursor_retrieve(&zc, &attr) == 0;
1282 zap_cursor_advance(&zc)) {
1283 (void) printf("\t\t%s = ", attr.za_name);
1284 if (attr.za_num_integers == 0) {
1285 (void) printf("\n");
1286 continue;
1287 }
1288 (void) printf(" %llx : [%d:%d:%d]\n",
1289 (u_longlong_t)attr.za_first_integer,
1290 (int)ATTR_LENGTH(attr.za_first_integer),
1291 (int)ATTR_BSWAP(attr.za_first_integer),
1292 (int)ATTR_NUM(attr.za_first_integer));
1293 }
1294 zap_cursor_fini(&zc);
1295 }
1296
1297 static void
dump_sa_layouts(objset_t * os,uint64_t object,void * data,size_t size)1298 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
1299 {
1300 (void) data, (void) size;
1301 zap_cursor_t zc;
1302 zap_attribute_t attr;
1303 uint16_t *layout_attrs;
1304 unsigned i;
1305
1306 dump_zap_stats(os, object);
1307 (void) printf("\n");
1308
1309 for (zap_cursor_init(&zc, os, object);
1310 zap_cursor_retrieve(&zc, &attr) == 0;
1311 zap_cursor_advance(&zc)) {
1312 (void) printf("\t\t%s = [", attr.za_name);
1313 if (attr.za_num_integers == 0) {
1314 (void) printf("\n");
1315 continue;
1316 }
1317
1318 VERIFY(attr.za_integer_length == 2);
1319 layout_attrs = umem_zalloc(attr.za_num_integers *
1320 attr.za_integer_length, UMEM_NOFAIL);
1321
1322 VERIFY(zap_lookup(os, object, attr.za_name,
1323 attr.za_integer_length,
1324 attr.za_num_integers, layout_attrs) == 0);
1325
1326 for (i = 0; i != attr.za_num_integers; i++)
1327 (void) printf(" %d ", (int)layout_attrs[i]);
1328 (void) printf("]\n");
1329 umem_free(layout_attrs,
1330 attr.za_num_integers * attr.za_integer_length);
1331 }
1332 zap_cursor_fini(&zc);
1333 }
1334
1335 static void
dump_zpldir(objset_t * os,uint64_t object,void * data,size_t size)1336 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
1337 {
1338 (void) data, (void) size;
1339 zap_cursor_t zc;
1340 zap_attribute_t attr;
1341 const char *typenames[] = {
1342 /* 0 */ "not specified",
1343 /* 1 */ "FIFO",
1344 /* 2 */ "Character Device",
1345 /* 3 */ "3 (invalid)",
1346 /* 4 */ "Directory",
1347 /* 5 */ "5 (invalid)",
1348 /* 6 */ "Block Device",
1349 /* 7 */ "7 (invalid)",
1350 /* 8 */ "Regular File",
1351 /* 9 */ "9 (invalid)",
1352 /* 10 */ "Symbolic Link",
1353 /* 11 */ "11 (invalid)",
1354 /* 12 */ "Socket",
1355 /* 13 */ "Door",
1356 /* 14 */ "Event Port",
1357 /* 15 */ "15 (invalid)",
1358 };
1359
1360 dump_zap_stats(os, object);
1361 (void) printf("\n");
1362
1363 for (zap_cursor_init(&zc, os, object);
1364 zap_cursor_retrieve(&zc, &attr) == 0;
1365 zap_cursor_advance(&zc)) {
1366 (void) printf("\t\t%s = %lld (type: %s)\n",
1367 attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
1368 typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
1369 }
1370 zap_cursor_fini(&zc);
1371 }
1372
1373 static int
get_dtl_refcount(vdev_t * vd)1374 get_dtl_refcount(vdev_t *vd)
1375 {
1376 int refcount = 0;
1377
1378 if (vd->vdev_ops->vdev_op_leaf) {
1379 space_map_t *sm = vd->vdev_dtl_sm;
1380
1381 if (sm != NULL &&
1382 sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
1383 return (1);
1384 return (0);
1385 }
1386
1387 for (unsigned c = 0; c < vd->vdev_children; c++)
1388 refcount += get_dtl_refcount(vd->vdev_child[c]);
1389 return (refcount);
1390 }
1391
1392 static int
get_metaslab_refcount(vdev_t * vd)1393 get_metaslab_refcount(vdev_t *vd)
1394 {
1395 int refcount = 0;
1396
1397 if (vd->vdev_top == vd) {
1398 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1399 space_map_t *sm = vd->vdev_ms[m]->ms_sm;
1400
1401 if (sm != NULL &&
1402 sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
1403 refcount++;
1404 }
1405 }
1406 for (unsigned c = 0; c < vd->vdev_children; c++)
1407 refcount += get_metaslab_refcount(vd->vdev_child[c]);
1408
1409 return (refcount);
1410 }
1411
1412 static int
get_obsolete_refcount(vdev_t * vd)1413 get_obsolete_refcount(vdev_t *vd)
1414 {
1415 uint64_t obsolete_sm_object;
1416 int refcount = 0;
1417
1418 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
1419 if (vd->vdev_top == vd && obsolete_sm_object != 0) {
1420 dmu_object_info_t doi;
1421 VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
1422 obsolete_sm_object, &doi));
1423 if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
1424 refcount++;
1425 }
1426 } else {
1427 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
1428 ASSERT3U(obsolete_sm_object, ==, 0);
1429 }
1430 for (unsigned c = 0; c < vd->vdev_children; c++) {
1431 refcount += get_obsolete_refcount(vd->vdev_child[c]);
1432 }
1433
1434 return (refcount);
1435 }
1436
1437 static int
get_prev_obsolete_spacemap_refcount(spa_t * spa)1438 get_prev_obsolete_spacemap_refcount(spa_t *spa)
1439 {
1440 uint64_t prev_obj =
1441 spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
1442 if (prev_obj != 0) {
1443 dmu_object_info_t doi;
1444 VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
1445 if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
1446 return (1);
1447 }
1448 }
1449 return (0);
1450 }
1451
1452 static int
get_checkpoint_refcount(vdev_t * vd)1453 get_checkpoint_refcount(vdev_t *vd)
1454 {
1455 int refcount = 0;
1456
1457 if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
1458 zap_contains(spa_meta_objset(vd->vdev_spa),
1459 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
1460 refcount++;
1461
1462 for (uint64_t c = 0; c < vd->vdev_children; c++)
1463 refcount += get_checkpoint_refcount(vd->vdev_child[c]);
1464
1465 return (refcount);
1466 }
1467
1468 static int
get_log_spacemap_refcount(spa_t * spa)1469 get_log_spacemap_refcount(spa_t *spa)
1470 {
1471 return (avl_numnodes(&spa->spa_sm_logs_by_txg));
1472 }
1473
1474 static int
verify_spacemap_refcounts(spa_t * spa)1475 verify_spacemap_refcounts(spa_t *spa)
1476 {
1477 uint64_t expected_refcount = 0;
1478 uint64_t actual_refcount;
1479
1480 (void) feature_get_refcount(spa,
1481 &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
1482 &expected_refcount);
1483 actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
1484 actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
1485 actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
1486 actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
1487 actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
1488 actual_refcount += get_log_spacemap_refcount(spa);
1489
1490 if (expected_refcount != actual_refcount) {
1491 (void) printf("space map refcount mismatch: expected %lld != "
1492 "actual %lld\n",
1493 (longlong_t)expected_refcount,
1494 (longlong_t)actual_refcount);
1495 return (2);
1496 }
1497 return (0);
1498 }
1499
/*
 * Print a space map.  Nothing is printed when "sm" is NULL.
 *
 * The object number, smp_length and smp_alloc are always printed.  The
 * individual entries are only printed when dump_opt['d'] is at least 6 or
 * dump_opt['m'] is at least 4.  In that case every 64-bit word is read
 * from the object in "os" and decoded as a debug entry, a single-word
 * entry or a two-word entry.  The allocated space implied by the entries
 * is accumulated and compared with space_map_allocated(); a message is
 * printed when the two differ.
 */
static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	/* Names for the action value decoded from a debug entry. */
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf(" smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf(" smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	/* The per-entry listing needs a higher verbosity level. */
	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		/* Debug entries carry a txg and sync pass, not a range. */
		if (sm_entry_is_debug(word)) {
			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
			if (de_txg == 0) {
				/* A debug entry with txg 0 is padding. */
				(void) printf(
				    "\t [%6llu] PADDING\n",
				    (u_longlong_t)entry_id);
			} else {
				(void) printf(
				    "\t [%6llu] %s: txg %llu pass %llu\n",
				    (u_longlong_t)entry_id,
				    ddata[SM_DEBUG_ACTION_DECODE(word)],
				    (u_longlong_t)de_txg,
				    (u_longlong_t)de_sync_pass);
			}
			entry_id++;
			continue;
		}

		uint8_t words;
		char entry_type;
		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

		if (sm_entry_is_single_word(word)) {
			/* Type, offset and run all come from this word. */
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;
			words = 1;
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			/*
			 * Advance offset past the first word here; the loop
			 * increment then steps over the second one.
			 */
			uint64_t extra_word;
			offset += sizeof (extra_word);
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			ASSERT3U(offset, <=, space_map_length(sm));

			/*
			 * Run and vdev come from the first word, type and
			 * offset from the second.
			 */
			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;
			words = 2;
		}

		(void) printf("\t [%6llu] %c range:"
		    " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
		    (u_longlong_t)entry_id,
		    entry_type, (u_longlong_t)entry_off,
		    (u_longlong_t)(entry_off + entry_run),
		    (u_longlong_t)entry_run,
		    (u_longlong_t)entry_vdev, words);

		/* Allocations add to the running total, frees subtract. */
		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	/* Report when the entries disagree with space_map_allocated(). */
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}
1602
1603 static void
dump_metaslab_stats(metaslab_t * msp)1604 dump_metaslab_stats(metaslab_t *msp)
1605 {
1606 char maxbuf[32];
1607 range_tree_t *rt = msp->ms_allocatable;
1608 zfs_btree_t *t = &msp->ms_allocatable_by_size;
1609 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1610
1611 /* max sure nicenum has enough space */
1612 _Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");
1613
1614 zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
1615
1616 (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
1617 "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
1618 "freepct", free_pct);
1619 (void) printf("\tIn-memory histogram:\n");
1620 dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
1621 }
1622
1623 static void
dump_metaslab(metaslab_t * msp)1624 dump_metaslab(metaslab_t *msp)
1625 {
1626 vdev_t *vd = msp->ms_group->mg_vd;
1627 spa_t *spa = vd->vdev_spa;
1628 space_map_t *sm = msp->ms_sm;
1629 char freebuf[32];
1630
1631 zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
1632 sizeof (freebuf));
1633
1634 (void) printf(
1635 "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
1636 (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
1637 (u_longlong_t)space_map_object(sm), freebuf);
1638
1639 if (dump_opt['m'] > 2 && !dump_opt['L']) {
1640 mutex_enter(&msp->ms_lock);
1641 VERIFY0(metaslab_load(msp));
1642 range_tree_stat_verify(msp->ms_allocatable);
1643 dump_metaslab_stats(msp);
1644 metaslab_unload(msp);
1645 mutex_exit(&msp->ms_lock);
1646 }
1647
1648 if (dump_opt['m'] > 1 && sm != NULL &&
1649 spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
1650 /*
1651 * The space map histogram represents free space in chunks
1652 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
1653 */
1654 (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
1655 (u_longlong_t)msp->ms_fragmentation);
1656 dump_histogram(sm->sm_phys->smp_histogram,
1657 SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
1658 }
1659
1660 if (vd->vdev_ops == &vdev_draid_ops)
1661 ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
1662 else
1663 ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);
1664
1665 dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
1666
1667 if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
1668 (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
1669 (u_longlong_t)metaslab_unflushed_txg(msp));
1670 }
1671 }
1672
1673 static void
print_vdev_metaslab_header(vdev_t * vd)1674 print_vdev_metaslab_header(vdev_t *vd)
1675 {
1676 vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
1677 const char *bias_str = "";
1678 if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
1679 bias_str = VDEV_ALLOC_BIAS_LOG;
1680 } else if (alloc_bias == VDEV_BIAS_SPECIAL) {
1681 bias_str = VDEV_ALLOC_BIAS_SPECIAL;
1682 } else if (alloc_bias == VDEV_BIAS_DEDUP) {
1683 bias_str = VDEV_ALLOC_BIAS_DEDUP;
1684 }
1685
1686 uint64_t ms_flush_data_obj = 0;
1687 if (vd->vdev_top_zap != 0) {
1688 int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
1689 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
1690 sizeof (uint64_t), 1, &ms_flush_data_obj);
1691 if (error != ENOENT) {
1692 ASSERT0(error);
1693 }
1694 }
1695
1696 (void) printf("\tvdev %10llu %s",
1697 (u_longlong_t)vd->vdev_id, bias_str);
1698
1699 if (ms_flush_data_obj != 0) {
1700 (void) printf(" ms_unflushed_phys object %llu",
1701 (u_longlong_t)ms_flush_data_obj);
1702 }
1703
1704 (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n",
1705 "metaslabs", (u_longlong_t)vd->vdev_ms_count,
1706 "offset", "spacemap", "free");
1707 (void) printf("\t%15s %19s %15s %12s\n",
1708 "---------------", "-------------------",
1709 "---------------", "------------");
1710 }
1711
1712 static void
dump_metaslab_groups(spa_t * spa,boolean_t show_special)1713 dump_metaslab_groups(spa_t *spa, boolean_t show_special)
1714 {
1715 vdev_t *rvd = spa->spa_root_vdev;
1716 metaslab_class_t *mc = spa_normal_class(spa);
1717 metaslab_class_t *smc = spa_special_class(spa);
1718 uint64_t fragmentation;
1719
1720 metaslab_class_histogram_verify(mc);
1721
1722 for (unsigned c = 0; c < rvd->vdev_children; c++) {
1723 vdev_t *tvd = rvd->vdev_child[c];
1724 metaslab_group_t *mg = tvd->vdev_mg;
1725
1726 if (mg == NULL || (mg->mg_class != mc &&
1727 (!show_special || mg->mg_class != smc)))
1728 continue;
1729
1730 metaslab_group_histogram_verify(mg);
1731 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
1732
1733 (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
1734 "fragmentation",
1735 (u_longlong_t)tvd->vdev_id,
1736 (u_longlong_t)tvd->vdev_ms_count);
1737 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
1738 (void) printf("%3s\n", "-");
1739 } else {
1740 (void) printf("%3llu%%\n",
1741 (u_longlong_t)mg->mg_fragmentation);
1742 }
1743 dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
1744 }
1745
1746 (void) printf("\tpool %s\tfragmentation", spa_name(spa));
1747 fragmentation = metaslab_class_fragmentation(mc);
1748 if (fragmentation == ZFS_FRAG_INVALID)
1749 (void) printf("\t%3s\n", "-");
1750 else
1751 (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
1752 dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
1753 }
1754
1755 static void
print_vdev_indirect(vdev_t * vd)1756 print_vdev_indirect(vdev_t *vd)
1757 {
1758 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
1759 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
1760 vdev_indirect_births_t *vib = vd->vdev_indirect_births;
1761
1762 if (vim == NULL) {
1763 ASSERT3P(vib, ==, NULL);
1764 return;
1765 }
1766
1767 ASSERT3U(vdev_indirect_mapping_object(vim), ==,
1768 vic->vic_mapping_object);
1769 ASSERT3U(vdev_indirect_births_object(vib), ==,
1770 vic->vic_births_object);
1771
1772 (void) printf("indirect births obj %llu:\n",
1773 (longlong_t)vic->vic_births_object);
1774 (void) printf(" vib_count = %llu\n",
1775 (longlong_t)vdev_indirect_births_count(vib));
1776 for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
1777 vdev_indirect_birth_entry_phys_t *cur_vibe =
1778 &vib->vib_entries[i];
1779 (void) printf("\toffset %llx -> txg %llu\n",
1780 (longlong_t)cur_vibe->vibe_offset,
1781 (longlong_t)cur_vibe->vibe_phys_birth_txg);
1782 }
1783 (void) printf("\n");
1784
1785 (void) printf("indirect mapping obj %llu:\n",
1786 (longlong_t)vic->vic_mapping_object);
1787 (void) printf(" vim_max_offset = 0x%llx\n",
1788 (longlong_t)vdev_indirect_mapping_max_offset(vim));
1789 (void) printf(" vim_bytes_mapped = 0x%llx\n",
1790 (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
1791 (void) printf(" vim_count = %llu\n",
1792 (longlong_t)vdev_indirect_mapping_num_entries(vim));
1793
1794 if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
1795 return;
1796
1797 uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
1798
1799 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
1800 vdev_indirect_mapping_entry_phys_t *vimep =
1801 &vim->vim_entries[i];
1802 (void) printf("\t<%llx:%llx:%llx> -> "
1803 "<%llx:%llx:%llx> (%x obsolete)\n",
1804 (longlong_t)vd->vdev_id,
1805 (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
1806 (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
1807 (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
1808 (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
1809 (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
1810 counts[i]);
1811 }
1812 (void) printf("\n");
1813
1814 uint64_t obsolete_sm_object;
1815 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
1816 if (obsolete_sm_object != 0) {
1817 objset_t *mos = vd->vdev_spa->spa_meta_objset;
1818 (void) printf("obsolete space map object %llu:\n",
1819 (u_longlong_t)obsolete_sm_object);
1820 ASSERT(vd->vdev_obsolete_sm != NULL);
1821 ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
1822 obsolete_sm_object);
1823 dump_spacemap(mos, vd->vdev_obsolete_sm);
1824 (void) printf("\n");
1825 }
1826 }
1827
1828 static void
dump_metaslabs(spa_t * spa)1829 dump_metaslabs(spa_t *spa)
1830 {
1831 vdev_t *vd, *rvd = spa->spa_root_vdev;
1832 uint64_t m, c = 0, children = rvd->vdev_children;
1833
1834 (void) printf("\nMetaslabs:\n");
1835
1836 if (!dump_opt['d'] && zopt_metaslab_args > 0) {
1837 c = zopt_metaslab[0];
1838
1839 if (c >= children)
1840 (void) fatal("bad vdev id: %llu", (u_longlong_t)c);
1841
1842 if (zopt_metaslab_args > 1) {
1843 vd = rvd->vdev_child[c];
1844 print_vdev_metaslab_header(vd);
1845
1846 for (m = 1; m < zopt_metaslab_args; m++) {
1847 if (zopt_metaslab[m] < vd->vdev_ms_count)
1848 dump_metaslab(
1849 vd->vdev_ms[zopt_metaslab[m]]);
1850 else
1851 (void) fprintf(stderr, "bad metaslab "
1852 "number %llu\n",
1853 (u_longlong_t)zopt_metaslab[m]);
1854 }
1855 (void) printf("\n");
1856 return;
1857 }
1858 children = c + 1;
1859 }
1860 for (; c < children; c++) {
1861 vd = rvd->vdev_child[c];
1862 print_vdev_metaslab_header(vd);
1863
1864 print_vdev_indirect(vd);
1865
1866 for (m = 0; m < vd->vdev_ms_count; m++)
1867 dump_metaslab(vd->vdev_ms[m]);
1868 (void) printf("\n");
1869 }
1870 }
1871
1872 static void
dump_log_spacemaps(spa_t * spa)1873 dump_log_spacemaps(spa_t *spa)
1874 {
1875 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
1876 return;
1877
1878 (void) printf("\nLog Space Maps in Pool:\n");
1879 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1880 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1881 space_map_t *sm = NULL;
1882 VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
1883 sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
1884
1885 (void) printf("Log Spacemap object %llu txg %llu\n",
1886 (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
1887 dump_spacemap(spa->spa_meta_objset, sm);
1888 space_map_close(sm);
1889 }
1890 (void) printf("\n");
1891 }
1892
1893 static void
dump_dde(const ddt_t * ddt,const ddt_entry_t * dde,uint64_t index)1894 dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
1895 {
1896 const ddt_phys_t *ddp = dde->dde_phys;
1897 const ddt_key_t *ddk = &dde->dde_key;
1898 const char *types[4] = { "ditto", "single", "double", "triple" };
1899 char blkbuf[BP_SPRINTF_LEN];
1900 blkptr_t blk;
1901 int p;
1902
1903 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1904 if (ddp->ddp_phys_birth == 0)
1905 continue;
1906 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
1907 snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
1908 (void) printf("index %llx refcnt %llu %s %s\n",
1909 (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
1910 types[p], blkbuf);
1911 }
1912 }
1913
1914 static void
dump_dedup_ratio(const ddt_stat_t * dds)1915 dump_dedup_ratio(const ddt_stat_t *dds)
1916 {
1917 double rL, rP, rD, D, dedup, compress, copies;
1918
1919 if (dds->dds_blocks == 0)
1920 return;
1921
1922 rL = (double)dds->dds_ref_lsize;
1923 rP = (double)dds->dds_ref_psize;
1924 rD = (double)dds->dds_ref_dsize;
1925 D = (double)dds->dds_dsize;
1926
1927 dedup = rD / D;
1928 compress = rL / rP;
1929 copies = rD / rP;
1930
1931 (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
1932 "dedup * compress / copies = %.2f\n\n",
1933 dedup, compress, copies, dedup * compress / copies);
1934 }
1935
1936 static void
dump_ddt(ddt_t * ddt,ddt_type_t type,ddt_class_t class)1937 dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
1938 {
1939 char name[DDT_NAMELEN];
1940 ddt_entry_t dde;
1941 uint64_t walk = 0;
1942 dmu_object_info_t doi;
1943 uint64_t count, dspace, mspace;
1944 int error;
1945
1946 error = ddt_object_info(ddt, type, class, &doi);
1947
1948 if (error == ENOENT)
1949 return;
1950 ASSERT(error == 0);
1951
1952 error = ddt_object_count(ddt, type, class, &count);
1953 ASSERT(error == 0);
1954 if (count == 0)
1955 return;
1956
1957 dspace = doi.doi_physical_blocks_512 << 9;
1958 mspace = doi.doi_fill_count * doi.doi_data_block_size;
1959
1960 ddt_object_name(ddt, type, class, name);
1961
1962 (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
1963 name,
1964 (u_longlong_t)count,
1965 (u_longlong_t)(dspace / count),
1966 (u_longlong_t)(mspace / count));
1967
1968 if (dump_opt['D'] < 3)
1969 return;
1970
1971 zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
1972
1973 if (dump_opt['D'] < 4)
1974 return;
1975
1976 if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
1977 return;
1978
1979 (void) printf("%s contents:\n\n", name);
1980
1981 while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
1982 dump_dde(ddt, &dde, walk);
1983
1984 ASSERT3U(error, ==, ENOENT);
1985
1986 (void) printf("\n");
1987 }
1988
1989 static void
dump_all_ddts(spa_t * spa)1990 dump_all_ddts(spa_t *spa)
1991 {
1992 ddt_histogram_t ddh_total = {{{0}}};
1993 ddt_stat_t dds_total = {0};
1994
1995 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1996 ddt_t *ddt = spa->spa_ddt[c];
1997 if (!ddt)
1998 continue;
1999 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
2000 for (ddt_class_t class = 0; class < DDT_CLASSES;
2001 class++) {
2002 dump_ddt(ddt, type, class);
2003 }
2004 }
2005 }
2006
2007 ddt_get_dedup_stats(spa, &dds_total);
2008
2009 if (dds_total.dds_blocks == 0) {
2010 (void) printf("All DDTs are empty\n");
2011 return;
2012 }
2013
2014 (void) printf("\n");
2015
2016 if (dump_opt['D'] > 1) {
2017 (void) printf("DDT histogram (aggregated over all DDTs):\n");
2018 ddt_get_dedup_histogram(spa, &ddh_total);
2019 zpool_dump_ddt(&dds_total, &ddh_total);
2020 }
2021
2022 dump_dedup_ratio(&dds_total);
2023 }
2024
2025 static void
dump_brt(spa_t * spa)2026 dump_brt(spa_t *spa)
2027 {
2028 if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {
2029 printf("BRT: unsupported on this pool\n");
2030 return;
2031 }
2032
2033 if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
2034 printf("BRT: empty\n");
2035 return;
2036 }
2037
2038 brt_t *brt = spa->spa_brt;
2039 VERIFY(brt);
2040
2041 char count[32], used[32], saved[32];
2042 zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
2043 zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
2044 uint64_t ratio = brt_get_ratio(spa);
2045 printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved,
2046 (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));
2047
2048 if (dump_opt['T'] < 2)
2049 return;
2050
2051 for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
2052 brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
2053 if (brtvd == NULL)
2054 continue;
2055
2056 if (!brtvd->bv_initiated) {
2057 printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
2058 continue;
2059 }
2060
2061 zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
2062 zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
2063 zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
2064 printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
2065 vdevid, count, used, saved);
2066 }
2067
2068 if (dump_opt['T'] < 3)
2069 return;
2070
2071 char dva[64];
2072 printf("\n%-16s %-10s\n", "DVA", "REFCNT");
2073
2074 for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
2075 brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
2076 if (brtvd == NULL || !brtvd->bv_initiated)
2077 continue;
2078
2079 zap_cursor_t zc;
2080 zap_attribute_t za;
2081 for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
2082 zap_cursor_retrieve(&zc, &za) == 0;
2083 zap_cursor_advance(&zc)) {
2084 uint64_t offset = *(uint64_t *)za.za_name;
2085 uint64_t refcnt = za.za_first_integer;
2086
2087 snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid,
2088 (u_longlong_t)offset);
2089 printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
2090 }
2091 zap_cursor_fini(&zc);
2092 }
2093 }
2094
2095 static void
dump_dtl_seg(void * arg,uint64_t start,uint64_t size)2096 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
2097 {
2098 char *prefix = arg;
2099
2100 (void) printf("%s [%llu,%llu) length %llu\n",
2101 prefix,
2102 (u_longlong_t)start,
2103 (u_longlong_t)(start + size),
2104 (u_longlong_t)(size));
2105 }
2106
2107 static void
dump_dtl(vdev_t * vd,int indent)2108 dump_dtl(vdev_t *vd, int indent)
2109 {
2110 spa_t *spa = vd->vdev_spa;
2111 boolean_t required;
2112 const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
2113 "outage" };
2114 char prefix[256];
2115
2116 spa_vdev_state_enter(spa, SCL_NONE);
2117 required = vdev_dtl_required(vd);
2118 (void) spa_vdev_state_exit(spa, NULL, 0);
2119
2120 if (indent == 0)
2121 (void) printf("\nDirty time logs:\n\n");
2122
2123 (void) printf("\t%*s%s [%s]\n", indent, "",
2124 vd->vdev_path ? vd->vdev_path :
2125 vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
2126 required ? "DTL-required" : "DTL-expendable");
2127
2128 for (int t = 0; t < DTL_TYPES; t++) {
2129 range_tree_t *rt = vd->vdev_dtl[t];
2130 if (range_tree_space(rt) == 0)
2131 continue;
2132 (void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
2133 indent + 2, "", name[t]);
2134 range_tree_walk(rt, dump_dtl_seg, prefix);
2135 if (dump_opt['d'] > 5 && vd->vdev_children == 0)
2136 dump_spacemap(spa->spa_meta_objset,
2137 vd->vdev_dtl_sm);
2138 }
2139
2140 for (unsigned c = 0; c < vd->vdev_children; c++)
2141 dump_dtl(vd->vdev_child[c], indent + 4);
2142 }
2143
2144 static void
dump_history(spa_t * spa)2145 dump_history(spa_t *spa)
2146 {
2147 nvlist_t **events = NULL;
2148 char *buf;
2149 uint64_t resid, len, off = 0;
2150 uint_t num = 0;
2151 int error;
2152 char tbuf[30];
2153
2154 if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
2155 (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
2156 __func__);
2157 return;
2158 }
2159
2160 do {
2161 len = SPA_OLD_MAXBLOCKSIZE;
2162
2163 if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
2164 (void) fprintf(stderr, "Unable to read history: "
2165 "error %d\n", error);
2166 free(buf);
2167 return;
2168 }
2169
2170 if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
2171 break;
2172
2173 off -= resid;
2174 } while (len != 0);
2175
2176 (void) printf("\nHistory:\n");
2177 for (unsigned i = 0; i < num; i++) {
2178 boolean_t printed = B_FALSE;
2179
2180 if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {
2181 time_t tsec;
2182 struct tm t;
2183
2184 tsec = fnvlist_lookup_uint64(events[i],
2185 ZPOOL_HIST_TIME);
2186 (void) localtime_r(&tsec, &t);
2187 (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
2188 } else {
2189 tbuf[0] = '\0';
2190 }
2191
2192 if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {
2193 (void) printf("%s %s\n", tbuf,
2194 fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));
2195 } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {
2196 uint64_t ievent;
2197
2198 ievent = fnvlist_lookup_uint64(events[i],
2199 ZPOOL_HIST_INT_EVENT);
2200 if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
2201 goto next;
2202
2203 (void) printf(" %s [internal %s txg:%ju] %s\n",
2204 tbuf,
2205 zfs_history_event_names[ievent],
2206 fnvlist_lookup_uint64(events[i],
2207 ZPOOL_HIST_TXG),
2208 fnvlist_lookup_string(events[i],
2209 ZPOOL_HIST_INT_STR));
2210 } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {
2211 (void) printf("%s [txg:%ju] %s", tbuf,
2212 fnvlist_lookup_uint64(events[i],
2213 ZPOOL_HIST_TXG),
2214 fnvlist_lookup_string(events[i],
2215 ZPOOL_HIST_INT_NAME));
2216
2217 if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {
2218 (void) printf(" %s (%llu)",
2219 fnvlist_lookup_string(events[i],
2220 ZPOOL_HIST_DSNAME),
2221 (u_longlong_t)fnvlist_lookup_uint64(
2222 events[i],
2223 ZPOOL_HIST_DSID));
2224 }
2225
2226 (void) printf(" %s\n", fnvlist_lookup_string(events[i],
2227 ZPOOL_HIST_INT_STR));
2228 } else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {
2229 (void) printf("%s ioctl %s\n", tbuf,
2230 fnvlist_lookup_string(events[i],
2231 ZPOOL_HIST_IOCTL));
2232
2233 if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {
2234 (void) printf(" input:\n");
2235 dump_nvlist(fnvlist_lookup_nvlist(events[i],
2236 ZPOOL_HIST_INPUT_NVL), 8);
2237 }
2238 if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {
2239 (void) printf(" output:\n");
2240 dump_nvlist(fnvlist_lookup_nvlist(events[i],
2241 ZPOOL_HIST_OUTPUT_NVL), 8);
2242 }
2243 if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {
2244 (void) printf(" errno: %lld\n",
2245 (longlong_t)fnvlist_lookup_int64(events[i],
2246 ZPOOL_HIST_ERRNO));
2247 }
2248 } else {
2249 goto next;
2250 }
2251
2252 printed = B_TRUE;
2253 next:
2254 if (dump_opt['h'] > 1) {
2255 if (!printed)
2256 (void) printf("unrecognized record:\n");
2257 dump_nvlist(events[i], 2);
2258 }
2259 }
2260 free(buf);
2261 }
2262
2263 static void
dump_dnode(objset_t * os,uint64_t object,void * data,size_t size)2264 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
2265 {
2266 (void) os, (void) object, (void) data, (void) size;
2267 }
2268
2269 static uint64_t
blkid2offset(const dnode_phys_t * dnp,const blkptr_t * bp,const zbookmark_phys_t * zb)2270 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
2271 const zbookmark_phys_t *zb)
2272 {
2273 if (dnp == NULL) {
2274 ASSERT(zb->zb_level < 0);
2275 if (zb->zb_object == 0)
2276 return (zb->zb_blkid);
2277 return (zb->zb_blkid * BP_GET_LSIZE(bp));
2278 }
2279
2280 ASSERT(zb->zb_level >= 0);
2281
2282 return ((zb->zb_blkid <<
2283 (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
2284 dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
2285 }
2286
2287 static void
snprintf_zstd_header(spa_t * spa,char * blkbuf,size_t buflen,const blkptr_t * bp)2288 snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
2289 const blkptr_t *bp)
2290 {
2291 static abd_t *pabd = NULL;
2292 void *buf;
2293 zio_t *zio;
2294 zfs_zstdhdr_t zstd_hdr;
2295 int error;
2296
2297 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
2298 return;
2299
2300 if (BP_IS_HOLE(bp))
2301 return;
2302
2303 if (BP_IS_EMBEDDED(bp)) {
2304 buf = malloc(SPA_MAXBLOCKSIZE);
2305 if (buf == NULL) {
2306 (void) fprintf(stderr, "out of memory\n");
2307 zdb_exit(1);
2308 }
2309 decode_embedded_bp_compressed(bp, buf);
2310 memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
2311 free(buf);
2312 zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
2313 zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
2314 (void) snprintf(blkbuf + strlen(blkbuf),
2315 buflen - strlen(blkbuf),
2316 " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
2317 zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
2318 zfs_get_hdrlevel(&zstd_hdr));
2319 return;
2320 }
2321
2322 if (!pabd)
2323 pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
2324 zio = zio_root(spa, NULL, NULL, 0);
2325
2326 /* Decrypt but don't decompress so we can read the compression header */
2327 zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
2328 ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
2329 NULL));
2330 error = zio_wait(zio);
2331 if (error) {
2332 (void) fprintf(stderr, "read failed: %d\n", error);
2333 return;
2334 }
2335 buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
2336 memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
2337 zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
2338 zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
2339
2340 (void) snprintf(blkbuf + strlen(blkbuf),
2341 buflen - strlen(blkbuf),
2342 " ZSTD:size=%u:version=%u:level=%u:NORMAL",
2343 zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
2344 zfs_get_hdrlevel(&zstd_hdr));
2345
2346 abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
2347 }
2348
2349 static void
snprintf_blkptr_compact(char * blkbuf,size_t buflen,const blkptr_t * bp,boolean_t bp_freed)2350 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
2351 boolean_t bp_freed)
2352 {
2353 const dva_t *dva = bp->blk_dva;
2354 int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
2355 int i;
2356
2357 if (dump_opt['b'] >= 6) {
2358 snprintf_blkptr(blkbuf, buflen, bp);
2359 if (bp_freed) {
2360 (void) snprintf(blkbuf + strlen(blkbuf),
2361 buflen - strlen(blkbuf), " %s", "FREE");
2362 }
2363 return;
2364 }
2365
2366 if (BP_IS_EMBEDDED(bp)) {
2367 (void) sprintf(blkbuf,
2368 "EMBEDDED et=%u %llxL/%llxP B=%llu",
2369 (int)BPE_GET_ETYPE(bp),
2370 (u_longlong_t)BPE_GET_LSIZE(bp),
2371 (u_longlong_t)BPE_GET_PSIZE(bp),
2372 (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
2373 return;
2374 }
2375
2376 blkbuf[0] = '\0';
2377
2378 for (i = 0; i < ndvas; i++)
2379 (void) snprintf(blkbuf + strlen(blkbuf),
2380 buflen - strlen(blkbuf), "%llu:%llx:%llx ",
2381 (u_longlong_t)DVA_GET_VDEV(&dva[i]),
2382 (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
2383 (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
2384
2385 if (BP_IS_HOLE(bp)) {
2386 (void) snprintf(blkbuf + strlen(blkbuf),
2387 buflen - strlen(blkbuf),
2388 "%llxL B=%llu",
2389 (u_longlong_t)BP_GET_LSIZE(bp),
2390 (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
2391 } else {
2392 (void) snprintf(blkbuf + strlen(blkbuf),
2393 buflen - strlen(blkbuf),
2394 "%llxL/%llxP F=%llu B=%llu/%llu",
2395 (u_longlong_t)BP_GET_LSIZE(bp),
2396 (u_longlong_t)BP_GET_PSIZE(bp),
2397 (u_longlong_t)BP_GET_FILL(bp),
2398 (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
2399 (u_longlong_t)BP_GET_BIRTH(bp));
2400 if (bp_freed)
2401 (void) snprintf(blkbuf + strlen(blkbuf),
2402 buflen - strlen(blkbuf), " %s", "FREE");
2403 (void) snprintf(blkbuf + strlen(blkbuf),
2404 buflen - strlen(blkbuf),
2405 " cksum=%016llx:%016llx:%016llx:%016llx",
2406 (u_longlong_t)bp->blk_cksum.zc_word[0],
2407 (u_longlong_t)bp->blk_cksum.zc_word[1],
2408 (u_longlong_t)bp->blk_cksum.zc_word[2],
2409 (u_longlong_t)bp->blk_cksum.zc_word[3]);
2410 }
2411 }
2412
2413 static void
print_indirect(spa_t * spa,blkptr_t * bp,const zbookmark_phys_t * zb,const dnode_phys_t * dnp)2414 print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
2415 const dnode_phys_t *dnp)
2416 {
2417 char blkbuf[BP_SPRINTF_LEN];
2418 int l;
2419
2420 if (!BP_IS_EMBEDDED(bp)) {
2421 ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
2422 ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
2423 }
2424
2425 (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
2426
2427 ASSERT(zb->zb_level >= 0);
2428
2429 for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
2430 if (l == zb->zb_level) {
2431 (void) printf("L%llx", (u_longlong_t)zb->zb_level);
2432 } else {
2433 (void) printf(" ");
2434 }
2435 }
2436
2437 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
2438 if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
2439 snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
2440 (void) printf("%s\n", blkbuf);
2441 }
2442
2443 static int
visit_indirect(spa_t * spa,const dnode_phys_t * dnp,blkptr_t * bp,const zbookmark_phys_t * zb)2444 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
2445 blkptr_t *bp, const zbookmark_phys_t *zb)
2446 {
2447 int err = 0;
2448
2449 if (BP_GET_LOGICAL_BIRTH(bp) == 0)
2450 return (0);
2451
2452 print_indirect(spa, bp, zb, dnp);
2453
2454 if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
2455 arc_flags_t flags = ARC_FLAG_WAIT;
2456 int i;
2457 blkptr_t *cbp;
2458 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
2459 arc_buf_t *buf;
2460 uint64_t fill = 0;
2461 ASSERT(!BP_IS_REDACTED(bp));
2462
2463 err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
2464 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
2465 if (err)
2466 return (err);
2467 ASSERT(buf->b_data);
2468
2469 /* recursively visit blocks below this */
2470 cbp = buf->b_data;
2471 for (i = 0; i < epb; i++, cbp++) {
2472 zbookmark_phys_t czb;
2473
2474 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
2475 zb->zb_level - 1,
2476 zb->zb_blkid * epb + i);
2477 err = visit_indirect(spa, dnp, cbp, &czb);
2478 if (err)
2479 break;
2480 fill += BP_GET_FILL(cbp);
2481 }
2482 if (!err)
2483 ASSERT3U(fill, ==, BP_GET_FILL(bp));
2484 arc_buf_destroy(buf, &buf);
2485 }
2486
2487 return (err);
2488 }
2489
2490 static void
dump_indirect(dnode_t * dn)2491 dump_indirect(dnode_t *dn)
2492 {
2493 dnode_phys_t *dnp = dn->dn_phys;
2494 zbookmark_phys_t czb;
2495
2496 (void) printf("Indirect blocks:\n");
2497
2498 SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
2499 dn->dn_object, dnp->dn_nlevels - 1, 0);
2500 for (int j = 0; j < dnp->dn_nblkptr; j++) {
2501 czb.zb_blkid = j;
2502 (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
2503 &dnp->dn_blkptr[j], &czb);
2504 }
2505
2506 (void) printf("\n");
2507 }
2508
2509 static void
dump_dsl_dir(objset_t * os,uint64_t object,void * data,size_t size)2510 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
2511 {
2512 (void) os, (void) object;
2513 dsl_dir_phys_t *dd = data;
2514 time_t crtime;
2515 char nice[32];
2516
2517 /* make sure nicenum has enough space */
2518 _Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated");
2519
2520 if (dd == NULL)
2521 return;
2522
2523 ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
2524
2525 crtime = dd->dd_creation_time;
2526 (void) printf("\t\tcreation_time = %s", ctime(&crtime));
2527 (void) printf("\t\thead_dataset_obj = %llu\n",
2528 (u_longlong_t)dd->dd_head_dataset_obj);
2529 (void) printf("\t\tparent_dir_obj = %llu\n",
2530 (u_longlong_t)dd->dd_parent_obj);
2531 (void) printf("\t\torigin_obj = %llu\n",
2532 (u_longlong_t)dd->dd_origin_obj);
2533 (void) printf("\t\tchild_dir_zapobj = %llu\n",
2534 (u_longlong_t)dd->dd_child_dir_zapobj);
2535 zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
2536 (void) printf("\t\tused_bytes = %s\n", nice);
2537 zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
2538 (void) printf("\t\tcompressed_bytes = %s\n", nice);
2539 zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
2540 (void) printf("\t\tuncompressed_bytes = %s\n", nice);
2541 zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
2542 (void) printf("\t\tquota = %s\n", nice);
2543 zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
2544 (void) printf("\t\treserved = %s\n", nice);
2545 (void) printf("\t\tprops_zapobj = %llu\n",
2546 (u_longlong_t)dd->dd_props_zapobj);
2547 (void) printf("\t\tdeleg_zapobj = %llu\n",
2548 (u_longlong_t)dd->dd_deleg_zapobj);
2549 (void) printf("\t\tflags = %llx\n",
2550 (u_longlong_t)dd->dd_flags);
2551
2552 #define DO(which) \
2553 zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
2554 sizeof (nice)); \
2555 (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
2556 DO(HEAD);
2557 DO(SNAP);
2558 DO(CHILD);
2559 DO(CHILD_RSRV);
2560 DO(REFRSRV);
2561 #undef DO
2562 (void) printf("\t\tclones = %llu\n",
2563 (u_longlong_t)dd->dd_clones);
2564 }
2565
2566 static void
dump_dsl_dataset(objset_t * os,uint64_t object,void * data,size_t size)2567 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
2568 {
2569 (void) os, (void) object;
2570 dsl_dataset_phys_t *ds = data;
2571 time_t crtime;
2572 char used[32], compressed[32], uncompressed[32], unique[32];
2573 char blkbuf[BP_SPRINTF_LEN];
2574
2575 /* make sure nicenum has enough space */
2576 _Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated");
2577 _Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ,
2578 "compressed truncated");
2579 _Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ,
2580 "uncompressed truncated");
2581 _Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated");
2582
2583 if (ds == NULL)
2584 return;
2585
2586 ASSERT(size == sizeof (*ds));
2587 crtime = ds->ds_creation_time;
2588 zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
2589 zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
2590 zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
2591 sizeof (uncompressed));
2592 zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
2593 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
2594
2595 (void) printf("\t\tdir_obj = %llu\n",
2596 (u_longlong_t)ds->ds_dir_obj);
2597 (void) printf("\t\tprev_snap_obj = %llu\n",
2598 (u_longlong_t)ds->ds_prev_snap_obj);
2599 (void) printf("\t\tprev_snap_txg = %llu\n",
2600 (u_longlong_t)ds->ds_prev_snap_txg);
2601 (void) printf("\t\tnext_snap_obj = %llu\n",
2602 (u_longlong_t)ds->ds_next_snap_obj);
2603 (void) printf("\t\tsnapnames_zapobj = %llu\n",
2604 (u_longlong_t)ds->ds_snapnames_zapobj);
2605 (void) printf("\t\tnum_children = %llu\n",
2606 (u_longlong_t)ds->ds_num_children);
2607 (void) printf("\t\tuserrefs_obj = %llu\n",
2608 (u_longlong_t)ds->ds_userrefs_obj);
2609 (void) printf("\t\tcreation_time = %s", ctime(&crtime));
2610 (void) printf("\t\tcreation_txg = %llu\n",
2611 (u_longlong_t)ds->ds_creation_txg);
2612 (void) printf("\t\tdeadlist_obj = %llu\n",
2613 (u_longlong_t)ds->ds_deadlist_obj);
2614 (void) printf("\t\tused_bytes = %s\n", used);
2615 (void) printf("\t\tcompressed_bytes = %s\n", compressed);
2616 (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
2617 (void) printf("\t\tunique = %s\n", unique);
2618 (void) printf("\t\tfsid_guid = %llu\n",
2619 (u_longlong_t)ds->ds_fsid_guid);
2620 (void) printf("\t\tguid = %llu\n",
2621 (u_longlong_t)ds->ds_guid);
2622 (void) printf("\t\tflags = %llx\n",
2623 (u_longlong_t)ds->ds_flags);
2624 (void) printf("\t\tnext_clones_obj = %llu\n",
2625 (u_longlong_t)ds->ds_next_clones_obj);
2626 (void) printf("\t\tprops_obj = %llu\n",
2627 (u_longlong_t)ds->ds_props_obj);
2628 (void) printf("\t\tbp = %s\n", blkbuf);
2629 }
2630
2631 static int
dump_bptree_cb(void * arg,const blkptr_t * bp,dmu_tx_t * tx)2632 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
2633 {
2634 (void) arg, (void) tx;
2635 char blkbuf[BP_SPRINTF_LEN];
2636
2637 if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
2638 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2639 (void) printf("\t%s\n", blkbuf);
2640 }
2641 return (0);
2642 }
2643
2644 static void
dump_bptree(objset_t * os,uint64_t obj,const char * name)2645 dump_bptree(objset_t *os, uint64_t obj, const char *name)
2646 {
2647 char bytes[32];
2648 bptree_phys_t *bt;
2649 dmu_buf_t *db;
2650
2651 /* make sure nicenum has enough space */
2652 _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
2653
2654 if (dump_opt['d'] < 3)
2655 return;
2656
2657 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
2658 bt = db->db_data;
2659 zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
2660 (void) printf("\n %s: %llu datasets, %s\n",
2661 name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
2662 dmu_buf_rele(db, FTAG);
2663
2664 if (dump_opt['d'] < 5)
2665 return;
2666
2667 (void) printf("\n");
2668
2669 (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
2670 }
2671
2672 static int
dump_bpobj_cb(void * arg,const blkptr_t * bp,boolean_t bp_freed,dmu_tx_t * tx)2673 dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
2674 {
2675 (void) arg, (void) tx;
2676 char blkbuf[BP_SPRINTF_LEN];
2677
2678 ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
2679 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
2680 (void) printf("\t%s\n", blkbuf);
2681 return (0);
2682 }
2683
2684 static void
dump_full_bpobj(bpobj_t * bpo,const char * name,int indent)2685 dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
2686 {
2687 char bytes[32];
2688 char comp[32];
2689 char uncomp[32];
2690 uint64_t i;
2691
2692 /* make sure nicenum has enough space */
2693 _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
2694 _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
2695 _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
2696
2697 if (dump_opt['d'] < 3)
2698 return;
2699
2700 zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
2701 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
2702 zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
2703 zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
2704 if (bpo->bpo_havefreed) {
2705 (void) printf(" %*s: object %llu, %llu local "
2706 "blkptrs, %llu freed, %llu subobjs in object %llu, "
2707 "%s (%s/%s comp)\n",
2708 indent * 8, name,
2709 (u_longlong_t)bpo->bpo_object,
2710 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
2711 (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
2712 (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
2713 (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
2714 bytes, comp, uncomp);
2715 } else {
2716 (void) printf(" %*s: object %llu, %llu local "
2717 "blkptrs, %llu subobjs in object %llu, "
2718 "%s (%s/%s comp)\n",
2719 indent * 8, name,
2720 (u_longlong_t)bpo->bpo_object,
2721 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
2722 (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
2723 (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
2724 bytes, comp, uncomp);
2725 }
2726
2727 for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
2728 uint64_t subobj;
2729 bpobj_t subbpo;
2730 int error;
2731 VERIFY0(dmu_read(bpo->bpo_os,
2732 bpo->bpo_phys->bpo_subobjs,
2733 i * sizeof (subobj), sizeof (subobj), &subobj, 0));
2734 error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
2735 if (error != 0) {
2736 (void) printf("ERROR %u while trying to open "
2737 "subobj id %llu\n",
2738 error, (u_longlong_t)subobj);
2739 continue;
2740 }
2741 dump_full_bpobj(&subbpo, "subobj", indent + 1);
2742 bpobj_close(&subbpo);
2743 }
2744 } else {
2745 if (bpo->bpo_havefreed) {
2746 (void) printf(" %*s: object %llu, %llu blkptrs, "
2747 "%llu freed, %s\n",
2748 indent * 8, name,
2749 (u_longlong_t)bpo->bpo_object,
2750 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
2751 (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
2752 bytes);
2753 } else {
2754 (void) printf(" %*s: object %llu, %llu blkptrs, "
2755 "%s\n",
2756 indent * 8, name,
2757 (u_longlong_t)bpo->bpo_object,
2758 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
2759 bytes);
2760 }
2761 }
2762
2763 if (dump_opt['d'] < 5)
2764 return;
2765
2766
2767 if (indent == 0) {
2768 (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
2769 (void) printf("\n");
2770 }
2771 }
2772
2773 static int
dump_bookmark(dsl_pool_t * dp,char * name,boolean_t print_redact,boolean_t print_list)2774 dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
2775 boolean_t print_list)
2776 {
2777 int err = 0;
2778 zfs_bookmark_phys_t prop;
2779 objset_t *mos = dp->dp_spa->spa_meta_objset;
2780 err = dsl_bookmark_lookup(dp, name, NULL, &prop);
2781
2782 if (err != 0) {
2783 return (err);
2784 }
2785
2786 (void) printf("\t#%s: ", strchr(name, '#') + 1);
2787 (void) printf("{guid: %llx creation_txg: %llu creation_time: "
2788 "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
2789 (u_longlong_t)prop.zbm_creation_txg,
2790 (u_longlong_t)prop.zbm_creation_time,
2791 (u_longlong_t)prop.zbm_redaction_obj);
2792
2793 IMPLY(print_list, print_redact);
2794 if (!print_redact || prop.zbm_redaction_obj == 0)
2795 return (0);
2796
2797 redaction_list_t *rl;
2798 VERIFY0(dsl_redaction_list_hold_obj(dp,
2799 prop.zbm_redaction_obj, FTAG, &rl));
2800
2801 redaction_list_phys_t *rlp = rl->rl_phys;
2802 (void) printf("\tRedacted:\n\t\tProgress: ");
2803 if (rlp->rlp_last_object != UINT64_MAX ||
2804 rlp->rlp_last_blkid != UINT64_MAX) {
2805 (void) printf("%llu %llu (incomplete)\n",
2806 (u_longlong_t)rlp->rlp_last_object,
2807 (u_longlong_t)rlp->rlp_last_blkid);
2808 } else {
2809 (void) printf("complete\n");
2810 }
2811 (void) printf("\t\tSnapshots: [");
2812 for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
2813 if (i > 0)
2814 (void) printf(", ");
2815 (void) printf("%0llu",
2816 (u_longlong_t)rlp->rlp_snaps[i]);
2817 }
2818 (void) printf("]\n\t\tLength: %llu\n",
2819 (u_longlong_t)rlp->rlp_num_entries);
2820
2821 if (!print_list) {
2822 dsl_redaction_list_rele(rl, FTAG);
2823 return (0);
2824 }
2825
2826 if (rlp->rlp_num_entries == 0) {
2827 dsl_redaction_list_rele(rl, FTAG);
2828 (void) printf("\t\tRedaction List: []\n\n");
2829 return (0);
2830 }
2831
2832 redact_block_phys_t *rbp_buf;
2833 uint64_t size;
2834 dmu_object_info_t doi;
2835
2836 VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
2837 size = doi.doi_max_offset;
2838 rbp_buf = kmem_alloc(size, KM_SLEEP);
2839
2840 err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
2841 rbp_buf, 0);
2842 if (err != 0) {
2843 dsl_redaction_list_rele(rl, FTAG);
2844 kmem_free(rbp_buf, size);
2845 return (err);
2846 }
2847
2848 (void) printf("\t\tRedaction List: [{object: %llx, offset: "
2849 "%llx, blksz: %x, count: %llx}",
2850 (u_longlong_t)rbp_buf[0].rbp_object,
2851 (u_longlong_t)rbp_buf[0].rbp_blkid,
2852 (uint_t)(redact_block_get_size(&rbp_buf[0])),
2853 (u_longlong_t)redact_block_get_count(&rbp_buf[0]));
2854
2855 for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
2856 (void) printf(",\n\t\t{object: %llx, offset: %llx, "
2857 "blksz: %x, count: %llx}",
2858 (u_longlong_t)rbp_buf[i].rbp_object,
2859 (u_longlong_t)rbp_buf[i].rbp_blkid,
2860 (uint_t)(redact_block_get_size(&rbp_buf[i])),
2861 (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
2862 }
2863 dsl_redaction_list_rele(rl, FTAG);
2864 kmem_free(rbp_buf, size);
2865 (void) printf("]\n\n");
2866 return (0);
2867 }
2868
2869 static void
dump_bookmarks(objset_t * os,int verbosity)2870 dump_bookmarks(objset_t *os, int verbosity)
2871 {
2872 zap_cursor_t zc;
2873 zap_attribute_t attr;
2874 dsl_dataset_t *ds = dmu_objset_ds(os);
2875 dsl_pool_t *dp = spa_get_dsl(os->os_spa);
2876 objset_t *mos = os->os_spa->spa_meta_objset;
2877 if (verbosity < 4)
2878 return;
2879 dsl_pool_config_enter(dp, FTAG);
2880
2881 for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
2882 zap_cursor_retrieve(&zc, &attr) == 0;
2883 zap_cursor_advance(&zc)) {
2884 char osname[ZFS_MAX_DATASET_NAME_LEN];
2885 char buf[ZFS_MAX_DATASET_NAME_LEN];
2886 int len;
2887 dmu_objset_name(os, osname);
2888 len = snprintf(buf, sizeof (buf), "%s#%s", osname,
2889 attr.za_name);
2890 VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);
2891 (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
2892 }
2893 zap_cursor_fini(&zc);
2894 dsl_pool_config_exit(dp, FTAG);
2895 }
2896
2897 static void
bpobj_count_refd(bpobj_t * bpo)2898 bpobj_count_refd(bpobj_t *bpo)
2899 {
2900 mos_obj_refd(bpo->bpo_object);
2901
2902 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
2903 mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
2904 for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
2905 uint64_t subobj;
2906 bpobj_t subbpo;
2907 int error;
2908 VERIFY0(dmu_read(bpo->bpo_os,
2909 bpo->bpo_phys->bpo_subobjs,
2910 i * sizeof (subobj), sizeof (subobj), &subobj, 0));
2911 error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
2912 if (error != 0) {
2913 (void) printf("ERROR %u while trying to open "
2914 "subobj id %llu\n",
2915 error, (u_longlong_t)subobj);
2916 continue;
2917 }
2918 bpobj_count_refd(&subbpo);
2919 bpobj_close(&subbpo);
2920 }
2921 }
2922 }
2923
2924 static int
dsl_deadlist_entry_count_refd(void * arg,dsl_deadlist_entry_t * dle)2925 dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
2926 {
2927 spa_t *spa = arg;
2928 uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
2929 if (dle->dle_bpobj.bpo_object != empty_bpobj)
2930 bpobj_count_refd(&dle->dle_bpobj);
2931 return (0);
2932 }
2933
2934 static int
dsl_deadlist_entry_dump(void * arg,dsl_deadlist_entry_t * dle)2935 dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
2936 {
2937 ASSERT(arg == NULL);
2938 if (dump_opt['d'] >= 5) {
2939 char buf[128];
2940 (void) snprintf(buf, sizeof (buf),
2941 "mintxg %llu -> obj %llu",
2942 (longlong_t)dle->dle_mintxg,
2943 (longlong_t)dle->dle_bpobj.bpo_object);
2944
2945 dump_full_bpobj(&dle->dle_bpobj, buf, 0);
2946 } else {
2947 (void) printf("mintxg %llu -> obj %llu\n",
2948 (longlong_t)dle->dle_mintxg,
2949 (longlong_t)dle->dle_bpobj.bpo_object);
2950 }
2951 return (0);
2952 }
2953
2954 static void
dump_blkptr_list(dsl_deadlist_t * dl,const char * name)2955 dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
2956 {
2957 char bytes[32];
2958 char comp[32];
2959 char uncomp[32];
2960 char entries[32];
2961 spa_t *spa = dmu_objset_spa(dl->dl_os);
2962 uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
2963
2964 if (dl->dl_oldfmt) {
2965 if (dl->dl_bpobj.bpo_object != empty_bpobj)
2966 bpobj_count_refd(&dl->dl_bpobj);
2967 } else {
2968 mos_obj_refd(dl->dl_object);
2969 dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
2970 }
2971
2972 /* make sure nicenum has enough space */
2973 _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
2974 _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
2975 _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
2976 _Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");
2977
2978 if (dump_opt['d'] < 3)
2979 return;
2980
2981 if (dl->dl_oldfmt) {
2982 dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
2983 return;
2984 }
2985
2986 zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
2987 zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
2988 zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
2989 zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
2990 (void) printf("\n %s: %s (%s/%s comp), %s entries\n",
2991 name, bytes, comp, uncomp, entries);
2992
2993 if (dump_opt['d'] < 4)
2994 return;
2995
2996 (void) putchar('\n');
2997
2998 dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
2999 }
3000
3001 static int
verify_dd_livelist(objset_t * os)3002 verify_dd_livelist(objset_t *os)
3003 {
3004 uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
3005 dsl_pool_t *dp = spa_get_dsl(os->os_spa);
3006 dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
3007
3008 ASSERT(!dmu_objset_is_snapshot(os));
3009 if (!dsl_deadlist_is_open(&dd->dd_livelist))
3010 return (0);
3011
3012 /* Iterate through the livelist to check for duplicates */
3013 dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
3014 NULL);
3015
3016 dsl_pool_config_enter(dp, FTAG);
3017 dsl_deadlist_space(&dd->dd_livelist, &ll_used,
3018 &ll_comp, &ll_uncomp);
3019
3020 dsl_dataset_t *origin_ds;
3021 ASSERT(dsl_pool_config_held(dp));
3022 VERIFY0(dsl_dataset_hold_obj(dp,
3023 dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
3024 VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
3025 &used, &comp, &uncomp));
3026 dsl_dataset_rele(origin_ds, FTAG);
3027 dsl_pool_config_exit(dp, FTAG);
3028 /*
3029 * It's possible that the dataset's uncomp space is larger than the
3030 * livelist's because livelists do not track embedded block pointers
3031 */
3032 if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
3033 char nice_used[32], nice_comp[32], nice_uncomp[32];
3034 (void) printf("Discrepancy in space accounting:\n");
3035 zdb_nicenum(used, nice_used, sizeof (nice_used));
3036 zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
3037 zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
3038 (void) printf("dir: used %s, comp %s, uncomp %s\n",
3039 nice_used, nice_comp, nice_uncomp);
3040 zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
3041 zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
3042 zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
3043 (void) printf("livelist: used %s, comp %s, uncomp %s\n",
3044 nice_used, nice_comp, nice_uncomp);
3045 return (1);
3046 }
3047 return (0);
3048 }
3049
3050 static char *key_material = NULL;
3051
3052 static boolean_t
zdb_derive_key(dsl_dir_t * dd,uint8_t * key_out)3053 zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
3054 {
3055 uint64_t keyformat, salt, iters;
3056 int i;
3057 unsigned char c;
3058
3059 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
3060 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
3061 1, &keyformat));
3062
3063 switch (keyformat) {
3064 case ZFS_KEYFORMAT_HEX:
3065 for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {
3066 if (!isxdigit(key_material[i]) ||
3067 !isxdigit(key_material[i+1]))
3068 return (B_FALSE);
3069 if (sscanf(&key_material[i], "%02hhx", &c) != 1)
3070 return (B_FALSE);
3071 key_out[i / 2] = c;
3072 }
3073 break;
3074
3075 case ZFS_KEYFORMAT_PASSPHRASE:
3076 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
3077 dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
3078 sizeof (uint64_t), 1, &salt));
3079 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
3080 dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
3081 sizeof (uint64_t), 1, &iters));
3082
3083 if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),
3084 ((uint8_t *)&salt), sizeof (uint64_t), iters,
3085 WRAPPING_KEY_LEN, key_out) != 1)
3086 return (B_FALSE);
3087
3088 break;
3089
3090 default:
3091 fatal("no support for key format %u\n",
3092 (unsigned int) keyformat);
3093 }
3094
3095 return (B_TRUE);
3096 }
3097
3098 static char encroot[ZFS_MAX_DATASET_NAME_LEN];
3099 static boolean_t key_loaded = B_FALSE;
3100
3101 static void
zdb_load_key(objset_t * os)3102 zdb_load_key(objset_t *os)
3103 {
3104 dsl_pool_t *dp;
3105 dsl_dir_t *dd, *rdd;
3106 uint8_t key[WRAPPING_KEY_LEN];
3107 uint64_t rddobj;
3108 int err;
3109
3110 dp = spa_get_dsl(os->os_spa);
3111 dd = os->os_dsl_dataset->ds_dir;
3112
3113 dsl_pool_config_enter(dp, FTAG);
3114 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
3115 DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj));
3116 VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd));
3117 dsl_dir_name(rdd, encroot);
3118 dsl_dir_rele(rdd, FTAG);
3119
3120 if (!zdb_derive_key(dd, key))
3121 fatal("couldn't derive encryption key");
3122
3123 dsl_pool_config_exit(dp, FTAG);
3124
3125 ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);
3126
3127 dsl_crypto_params_t *dcp;
3128 nvlist_t *crypto_args;
3129
3130 crypto_args = fnvlist_alloc();
3131 fnvlist_add_uint8_array(crypto_args, "wkeydata",
3132 (uint8_t *)key, WRAPPING_KEY_LEN);
3133 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
3134 NULL, crypto_args, &dcp));
3135 err = spa_keystore_load_wkey(encroot, dcp, B_FALSE);
3136
3137 dsl_crypto_params_free(dcp, (err != 0));
3138 fnvlist_free(crypto_args);
3139
3140 if (err != 0)
3141 fatal(
3142 "couldn't load encryption key for %s: %s",
3143 encroot, err == ZFS_ERR_CRYPTO_NOTSUP ?
3144 "crypto params not supported" : strerror(err));
3145
3146 ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);
3147
3148 printf("Unlocked encryption root: %s\n", encroot);
3149 key_loaded = B_TRUE;
3150 }
3151
3152 static void
zdb_unload_key(void)3153 zdb_unload_key(void)
3154 {
3155 if (!key_loaded)
3156 return;
3157
3158 VERIFY0(spa_keystore_unload_wkey(encroot));
3159 key_loaded = B_FALSE;
3160 }
3161
3162 static avl_tree_t idx_tree;
3163 static avl_tree_t domain_tree;
3164 static boolean_t fuid_table_loaded;
3165 static objset_t *sa_os = NULL;
3166 static sa_attr_type_t *sa_attr_table = NULL;
3167
3168 static int
open_objset(const char * path,const void * tag,objset_t ** osp)3169 open_objset(const char *path, const void *tag, objset_t **osp)
3170 {
3171 int err;
3172 uint64_t sa_attrs = 0;
3173 uint64_t version = 0;
3174
3175 VERIFY3P(sa_os, ==, NULL);
3176
3177 /*
3178 * We can't own an objset if it's redacted. Therefore, we do this
3179 * dance: hold the objset, then acquire a long hold on its dataset, then
3180 * release the pool (which is held as part of holding the objset).
3181 */
3182
3183 if (dump_opt['K']) {
3184 /* decryption requested, try to load keys */
3185 err = dmu_objset_hold(path, tag, osp);
3186 if (err != 0) {
3187 (void) fprintf(stderr, "failed to hold dataset "
3188 "'%s': %s\n",
3189 path, strerror(err));
3190 return (err);
3191 }
3192 dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
3193 dsl_pool_rele(dmu_objset_pool(*osp), tag);
3194
3195 /* succeeds or dies */
3196 zdb_load_key(*osp);
3197
3198 /* release it all */
3199 dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
3200 dsl_dataset_rele(dmu_objset_ds(*osp), tag);
3201 }
3202
3203 int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;
3204
3205 err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);
3206 if (err != 0) {
3207 (void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
3208 path, strerror(err));
3209 return (err);
3210 }
3211 dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
3212 dsl_pool_rele(dmu_objset_pool(*osp), tag);
3213
3214 if (dmu_objset_type(*osp) == DMU_OST_ZFS &&
3215 (key_loaded || !(*osp)->os_encrypted)) {
3216 (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
3217 8, 1, &version);
3218 if (version >= ZPL_VERSION_SA) {
3219 (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
3220 8, 1, &sa_attrs);
3221 }
3222 err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
3223 &sa_attr_table);
3224 if (err != 0) {
3225 (void) fprintf(stderr, "sa_setup failed: %s\n",
3226 strerror(err));
3227 dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
3228 dsl_dataset_rele_flags(dmu_objset_ds(*osp),
3229 ds_hold_flags, tag);
3230 *osp = NULL;
3231 }
3232 }
3233 sa_os = *osp;
3234
3235 return (err);
3236 }
3237
3238 static void
close_objset(objset_t * os,const void * tag)3239 close_objset(objset_t *os, const void *tag)
3240 {
3241 VERIFY3P(os, ==, sa_os);
3242 if (os->os_sa != NULL)
3243 sa_tear_down(os);
3244 dsl_dataset_long_rele(dmu_objset_ds(os), tag);
3245 dsl_dataset_rele_flags(dmu_objset_ds(os),
3246 key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag);
3247 sa_attr_table = NULL;
3248 sa_os = NULL;
3249
3250 zdb_unload_key();
3251 }
3252
3253 static void
fuid_table_destroy(void)3254 fuid_table_destroy(void)
3255 {
3256 if (fuid_table_loaded) {
3257 zfs_fuid_table_destroy(&idx_tree, &domain_tree);
3258 fuid_table_loaded = B_FALSE;
3259 }
3260 }
3261
3262 static void
zdb_exit(int reason)3263 zdb_exit(int reason)
3264 {
3265 if (os != NULL) {
3266 close_objset(os, FTAG);
3267 } else if (spa != NULL) {
3268 spa_close(spa, FTAG);
3269 }
3270
3271 fuid_table_destroy();
3272
3273 if (kernel_init_done)
3274 kernel_fini();
3275
3276 exit(reason);
3277 }
3278
3279 /*
3280 * print uid or gid information.
3281 * For normal POSIX id just the id is printed in decimal format.
3282 * For CIFS files with FUID the fuid is printed in hex followed by
3283 * the domain-rid string.
3284 */
3285 static void
print_idstr(uint64_t id,const char * id_type)3286 print_idstr(uint64_t id, const char *id_type)
3287 {
3288 if (FUID_INDEX(id)) {
3289 const char *domain =
3290 zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
3291 (void) printf("\t%s %llx [%s-%d]\n", id_type,
3292 (u_longlong_t)id, domain, (int)FUID_RID(id));
3293 } else {
3294 (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
3295 }
3296
3297 }
3298
3299 static void
dump_uidgid(objset_t * os,uint64_t uid,uint64_t gid)3300 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
3301 {
3302 uint32_t uid_idx, gid_idx;
3303
3304 uid_idx = FUID_INDEX(uid);
3305 gid_idx = FUID_INDEX(gid);
3306
3307 /* Load domain table, if not already loaded */
3308 if (!fuid_table_loaded && (uid_idx || gid_idx)) {
3309 uint64_t fuid_obj;
3310
3311 /* first find the fuid object. It lives in the master node */
3312 VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
3313 8, 1, &fuid_obj) == 0);
3314 zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
3315 (void) zfs_fuid_table_load(os, fuid_obj,
3316 &idx_tree, &domain_tree);
3317 fuid_table_loaded = B_TRUE;
3318 }
3319
3320 print_idstr(uid, "uid");
3321 print_idstr(gid, "gid");
3322 }
3323
3324 static void
dump_znode_sa_xattr(sa_handle_t * hdl)3325 dump_znode_sa_xattr(sa_handle_t *hdl)
3326 {
3327 nvlist_t *sa_xattr;
3328 nvpair_t *elem = NULL;
3329 int sa_xattr_size = 0;
3330 int sa_xattr_entries = 0;
3331 int error;
3332 char *sa_xattr_packed;
3333
3334 error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
3335 if (error || sa_xattr_size == 0)
3336 return;
3337
3338 sa_xattr_packed = malloc(sa_xattr_size);
3339 if (sa_xattr_packed == NULL)
3340 return;
3341
3342 error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
3343 sa_xattr_packed, sa_xattr_size);
3344 if (error) {
3345 free(sa_xattr_packed);
3346 return;
3347 }
3348
3349 error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
3350 if (error) {
3351 free(sa_xattr_packed);
3352 return;
3353 }
3354
3355 while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
3356 sa_xattr_entries++;
3357
3358 (void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
3359 sa_xattr_size, sa_xattr_entries);
3360 while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
3361 boolean_t can_print = !dump_opt['P'];
3362 uchar_t *value;
3363 uint_t cnt, idx;
3364
3365 (void) printf("\t\t%s = ", nvpair_name(elem));
3366 nvpair_value_byte_array(elem, &value, &cnt);
3367
3368 for (idx = 0; idx < cnt; ++idx) {
3369 if (!isprint(value[idx])) {
3370 can_print = B_FALSE;
3371 break;
3372 }
3373 }
3374
3375 for (idx = 0; idx < cnt; ++idx) {
3376 if (can_print)
3377 (void) putchar(value[idx]);
3378 else
3379 (void) printf("\\%3.3o", value[idx]);
3380 }
3381 (void) putchar('\n');
3382 }
3383
3384 nvlist_free(sa_xattr);
3385 free(sa_xattr_packed);
3386 }
3387
3388 static void
dump_znode_symlink(sa_handle_t * hdl)3389 dump_znode_symlink(sa_handle_t *hdl)
3390 {
3391 int sa_symlink_size = 0;
3392 char linktarget[MAXPATHLEN];
3393 int error;
3394
3395 error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
3396 if (error || sa_symlink_size == 0) {
3397 return;
3398 }
3399 if (sa_symlink_size >= sizeof (linktarget)) {
3400 (void) printf("symlink size %d is too large\n",
3401 sa_symlink_size);
3402 return;
3403 }
3404 linktarget[sa_symlink_size] = '\0';
3405 if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
3406 &linktarget, sa_symlink_size) == 0)
3407 (void) printf("\ttarget %s\n", linktarget);
3408 }
3409
3410 static void
dump_znode(objset_t * os,uint64_t object,void * data,size_t size)3411 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
3412 {
3413 (void) data, (void) size;
3414 char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
3415 sa_handle_t *hdl;
3416 uint64_t xattr, rdev, gen;
3417 uint64_t uid, gid, mode, fsize, parent, links;
3418 uint64_t pflags;
3419 uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
3420 time_t z_crtime, z_atime, z_mtime, z_ctime;
3421 sa_bulk_attr_t bulk[12];
3422 int idx = 0;
3423 int error;
3424
3425 VERIFY3P(os, ==, sa_os);
3426 if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
3427 (void) printf("Failed to get handle for SA znode\n");
3428 return;
3429 }
3430
3431 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
3432 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
3433 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
3434 &links, 8);
3435 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
3436 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
3437 &mode, 8);
3438 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
3439 NULL, &parent, 8);
3440 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
3441 &fsize, 8);
3442 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
3443 acctm, 16);
3444 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
3445 modtm, 16);
3446 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
3447 crtm, 16);
3448 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
3449 chgtm, 16);
3450 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
3451 &pflags, 8);
3452
3453 if (sa_bulk_lookup(hdl, bulk, idx)) {
3454 (void) sa_handle_destroy(hdl);
3455 return;
3456 }
3457
3458 z_crtime = (time_t)crtm[0];
3459 z_atime = (time_t)acctm[0];
3460 z_mtime = (time_t)modtm[0];
3461 z_ctime = (time_t)chgtm[0];
3462
3463 if (dump_opt['d'] > 4) {
3464 error = zfs_obj_to_path(os, object, path, sizeof (path));
3465 if (error == ESTALE) {
3466 (void) snprintf(path, sizeof (path), "on delete queue");
3467 } else if (error != 0) {
3468 leaked_objects++;
3469 (void) snprintf(path, sizeof (path),
3470 "path not found, possibly leaked");
3471 }
3472 (void) printf("\tpath %s\n", path);
3473 }
3474
3475 if (S_ISLNK(mode))
3476 dump_znode_symlink(hdl);
3477 dump_uidgid(os, uid, gid);
3478 (void) printf("\tatime %s", ctime(&z_atime));
3479 (void) printf("\tmtime %s", ctime(&z_mtime));
3480 (void) printf("\tctime %s", ctime(&z_ctime));
3481 (void) printf("\tcrtime %s", ctime(&z_crtime));
3482 (void) printf("\tgen %llu\n", (u_longlong_t)gen);
3483 (void) printf("\tmode %llo\n", (u_longlong_t)mode);
3484 (void) printf("\tsize %llu\n", (u_longlong_t)fsize);
3485 (void) printf("\tparent %llu\n", (u_longlong_t)parent);
3486 (void) printf("\tlinks %llu\n", (u_longlong_t)links);
3487 (void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
3488 if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
3489 uint64_t projid;
3490
3491 if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
3492 sizeof (uint64_t)) == 0)
3493 (void) printf("\tprojid %llu\n", (u_longlong_t)projid);
3494 }
3495 if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
3496 sizeof (uint64_t)) == 0)
3497 (void) printf("\txattr %llu\n", (u_longlong_t)xattr);
3498 if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
3499 sizeof (uint64_t)) == 0)
3500 (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
3501 dump_znode_sa_xattr(hdl);
3502 sa_handle_destroy(hdl);
3503 }
3504
3505 static void
dump_acl(objset_t * os,uint64_t object,void * data,size_t size)3506 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
3507 {
3508 (void) os, (void) object, (void) data, (void) size;
3509 }
3510
3511 static void
dump_dmu_objset(objset_t * os,uint64_t object,void * data,size_t size)3512 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
3513 {
3514 (void) os, (void) object, (void) data, (void) size;
3515 }
3516
/*
 * Per-type dump routines.  dump_object() indexes this table with
 * ZDB_OT_TYPE(type) for both an object's bonus type and its own type.
 * The entries are positional, so their order must follow the object type
 * numbering; the extra final slot (hence DMU_OT_NUMTYPES + 1) is the
 * fallback viewer for unknown types and must remain last.
 */
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
	dump_none,		/* unallocated			*/
	dump_zap,		/* object directory		*/
	dump_uint64,		/* object array			*/
	dump_none,		/* packed nvlist		*/
	dump_packed_nvlist,	/* packed nvlist size		*/
	dump_none,		/* bpobj			*/
	dump_bpobj,		/* bpobj header			*/
	dump_none,		/* SPA space map header		*/
	dump_none,		/* SPA space map		*/
	dump_none,		/* ZIL intent log		*/
	dump_dnode,		/* DMU dnode			*/
	dump_dmu_objset,	/* DMU objset			*/
	dump_dsl_dir,		/* DSL directory		*/
	dump_zap,		/* DSL directory child map	*/
	dump_zap,		/* DSL dataset snap map		*/
	dump_zap,		/* DSL props			*/
	dump_dsl_dataset,	/* DSL dataset			*/
	dump_znode,		/* ZFS znode			*/
	dump_acl,		/* ZFS V0 ACL			*/
	dump_uint8,		/* ZFS plain file		*/
	dump_zpldir,		/* ZFS directory		*/
	dump_zap,		/* ZFS master node		*/
	dump_zap,		/* ZFS delete queue		*/
	dump_uint8,		/* zvol object			*/
	dump_zap,		/* zvol prop			*/
	dump_uint8,		/* other uint8[]		*/
	dump_uint64,		/* other uint64[]		*/
	dump_zap,		/* other ZAP			*/
	dump_zap,		/* persistent error log		*/
	dump_uint8,		/* SPA history			*/
	dump_history_offsets,	/* SPA history offsets		*/
	dump_zap,		/* Pool properties		*/
	dump_zap,		/* DSL permissions		*/
	dump_acl,		/* ZFS ACL			*/
	dump_uint8,		/* ZFS SYSACL			*/
	dump_none,		/* FUID nvlist			*/
	dump_packed_nvlist,	/* FUID nvlist size		*/
	dump_zap,		/* DSL dataset next clones	*/
	dump_zap,		/* DSL scrub queue		*/
	dump_zap,		/* ZFS user/group/project used	*/
	dump_zap,		/* ZFS user/group/project quota	*/
	dump_zap,		/* snapshot refcount tags	*/
	dump_ddt_zap,		/* DDT ZAP object		*/
	dump_zap,		/* DDT statistics		*/
	dump_znode,		/* SA object			*/
	dump_zap,		/* SA Master Node		*/
	dump_sa_attrs,		/* SA attribute registration	*/
	dump_sa_layouts,	/* SA attribute layouts		*/
	dump_zap,		/* DSL scrub translations	*/
	dump_none,		/* fake dedup BP		*/
	dump_zap,		/* deadlist			*/
	dump_none,		/* deadlist hdr			*/
	dump_zap,		/* dsl clones			*/
	dump_bpobj_subobjs,	/* bpobj subobjs		*/
	dump_unknown,		/* Unknown type, must be last	*/
};
3574
3575 static boolean_t
match_object_type(dmu_object_type_t obj_type,uint64_t flags)3576 match_object_type(dmu_object_type_t obj_type, uint64_t flags)
3577 {
3578 boolean_t match = B_TRUE;
3579
3580 switch (obj_type) {
3581 case DMU_OT_DIRECTORY_CONTENTS:
3582 if (!(flags & ZOR_FLAG_DIRECTORY))
3583 match = B_FALSE;
3584 break;
3585 case DMU_OT_PLAIN_FILE_CONTENTS:
3586 if (!(flags & ZOR_FLAG_PLAIN_FILE))
3587 match = B_FALSE;
3588 break;
3589 case DMU_OT_SPACE_MAP:
3590 if (!(flags & ZOR_FLAG_SPACE_MAP))
3591 match = B_FALSE;
3592 break;
3593 default:
3594 if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {
3595 if (!(flags & ZOR_FLAG_ZAP))
3596 match = B_FALSE;
3597 break;
3598 }
3599
3600 /*
3601 * If all bits except some of the supported flags are
3602 * set, the user combined the all-types flag (A) with
3603 * a negated flag to exclude some types (e.g. A-f to
3604 * show all object types except plain files).
3605 */
3606 if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)
3607 match = B_FALSE;
3608
3609 break;
3610 }
3611
3612 return (match);
3613 }
3614
3615 static void
dump_object(objset_t * os,uint64_t object,int verbosity,boolean_t * print_header,uint64_t * dnode_slots_used,uint64_t flags)3616 dump_object(objset_t *os, uint64_t object, int verbosity,
3617 boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
3618 {
3619 dmu_buf_t *db = NULL;
3620 dmu_object_info_t doi;
3621 dnode_t *dn;
3622 boolean_t dnode_held = B_FALSE;
3623 void *bonus = NULL;
3624 size_t bsize = 0;
3625 char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
3626 char bonus_size[32];
3627 char aux[50];
3628 int error;
3629
3630 /* make sure nicenum has enough space */
3631 _Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");
3632 _Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");
3633 _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");
3634 _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");
3635 _Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,
3636 "bonus_size truncated");
3637
3638 if (*print_header) {
3639 (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
3640 "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
3641 "lsize", "%full", "type");
3642 *print_header = 0;
3643 }
3644
3645 if (object == 0) {
3646 dn = DMU_META_DNODE(os);
3647 dmu_object_info_from_dnode(dn, &doi);
3648 } else {
3649 /*
3650 * Encrypted datasets will have sensitive bonus buffers
3651 * encrypted. Therefore we cannot hold the bonus buffer and
3652 * must hold the dnode itself instead.
3653 */
3654 error = dmu_object_info(os, object, &doi);
3655 if (error)
3656 fatal("dmu_object_info() failed, errno %u", error);
3657
3658 if (!key_loaded && os->os_encrypted &&
3659 DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
3660 error = dnode_hold(os, object, FTAG, &dn);
3661 if (error)
3662 fatal("dnode_hold() failed, errno %u", error);
3663 dnode_held = B_TRUE;
3664 } else {
3665 error = dmu_bonus_hold(os, object, FTAG, &db);
3666 if (error)
3667 fatal("dmu_bonus_hold(%llu) failed, errno %u",
3668 object, error);
3669 bonus = db->db_data;
3670 bsize = db->db_size;
3671 dn = DB_DNODE((dmu_buf_impl_t *)db);
3672 }
3673 }
3674
3675 /*
3676 * Default to showing all object types if no flags were specified.
3677 */
3678 if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
3679 !match_object_type(doi.doi_type, flags))
3680 goto out;
3681
3682 if (dnode_slots_used)
3683 *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
3684
3685 zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
3686 zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
3687 zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
3688 zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
3689 zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
3690 zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
3691 (void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *
3692 doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ?
3693 DNODES_PER_BLOCK : 1) / doi.doi_max_offset);
3694
3695 aux[0] = '\0';
3696
3697 if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
3698 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
3699 " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
3700 }
3701
3702 if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
3703 ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
3704 const char *compname = NULL;
3705 if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
3706 ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
3707 &compname) == 0) {
3708 (void) snprintf(aux + strlen(aux),
3709 sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
3710 compname);
3711 } else {
3712 (void) snprintf(aux + strlen(aux),
3713 sizeof (aux) - strlen(aux),
3714 " (Z=inherit=%s-unknown)",
3715 ZDB_COMPRESS_NAME(os->os_compress));
3716 }
3717 } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
3718 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
3719 " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
3720 } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
3721 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
3722 " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
3723 }
3724
3725 (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",
3726 (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
3727 asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);
3728
3729 if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
3730 (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",
3731 "", "", "", "", "", "", bonus_size, "bonus",
3732 zdb_ot_name(doi.doi_bonus_type));
3733 }
3734
3735 if (verbosity >= 4) {
3736 (void) printf("\tdnode flags: %s%s%s%s\n",
3737 (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
3738 "USED_BYTES " : "",
3739 (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
3740 "USERUSED_ACCOUNTED " : "",
3741 (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
3742 "USEROBJUSED_ACCOUNTED " : "",
3743 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
3744 "SPILL_BLKPTR" : "");
3745 (void) printf("\tdnode maxblkid: %llu\n",
3746 (longlong_t)dn->dn_phys->dn_maxblkid);
3747
3748 if (!dnode_held) {
3749 object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
3750 object, bonus, bsize);
3751 } else {
3752 (void) printf("\t\t(bonus encrypted)\n");
3753 }
3754
3755 if (key_loaded ||
3756 (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) {
3757 object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
3758 NULL, 0);
3759 } else {
3760 (void) printf("\t\t(object encrypted)\n");
3761 }
3762
3763 *print_header = B_TRUE;
3764 }
3765
3766 if (verbosity >= 5) {
3767 if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
3768 char blkbuf[BP_SPRINTF_LEN];
3769 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
3770 DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE);
3771 (void) printf("\nSpill block: %s\n", blkbuf);
3772 }
3773 dump_indirect(dn);
3774 }
3775
3776 if (verbosity >= 5) {
3777 /*
3778 * Report the list of segments that comprise the object.
3779 */
3780 uint64_t start = 0;
3781 uint64_t end;
3782 uint64_t blkfill = 1;
3783 int minlvl = 1;
3784
3785 if (dn->dn_type == DMU_OT_DNODE) {
3786 minlvl = 0;
3787 blkfill = DNODES_PER_BLOCK;
3788 }
3789
3790 for (;;) {
3791 char segsize[32];
3792 /* make sure nicenum has enough space */
3793 _Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ,
3794 "segsize truncated");
3795 error = dnode_next_offset(dn,
3796 0, &start, minlvl, blkfill, 0);
3797 if (error)
3798 break;
3799 end = start;
3800 error = dnode_next_offset(dn,
3801 DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
3802 zdb_nicenum(end - start, segsize, sizeof (segsize));
3803 (void) printf("\t\tsegment [%016llx, %016llx)"
3804 " size %5s\n", (u_longlong_t)start,
3805 (u_longlong_t)end, segsize);
3806 if (error)
3807 break;
3808 start = end;
3809 }
3810 }
3811
3812 out:
3813 if (db != NULL)
3814 dmu_buf_rele(db, FTAG);
3815 if (dnode_held)
3816 dnode_rele(dn, FTAG);
3817 }
3818
3819 static void
count_dir_mos_objects(dsl_dir_t * dd)3820 count_dir_mos_objects(dsl_dir_t *dd)
3821 {
3822 mos_obj_refd(dd->dd_object);
3823 mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
3824 mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
3825 mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
3826 mos_obj_refd(dsl_dir_phys(dd)->dd_clones);
3827
3828 /*
3829 * The dd_crypto_obj can be referenced by multiple dsl_dir's.
3830 * Ignore the references after the first one.
3831 */
3832 mos_obj_refd_multiple(dd->dd_crypto_obj);
3833 }
3834
3835 static void
count_ds_mos_objects(dsl_dataset_t * ds)3836 count_ds_mos_objects(dsl_dataset_t *ds)
3837 {
3838 mos_obj_refd(ds->ds_object);
3839 mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
3840 mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
3841 mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
3842 mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
3843 mos_obj_refd(ds->ds_bookmarks_obj);
3844
3845 if (!dsl_dataset_is_snapshot(ds)) {
3846 count_dir_mos_objects(ds->ds_dir);
3847 }
3848 }
3849
/*
 * Printable names for objset types.  dump_objset() indexes this table with
 * dds_type after checking it against DMU_OST_NUMTYPES, and falls back to
 * "UNKNOWN" for anything out of range.
 */
static const char *const objset_types[DMU_OST_NUMTYPES] = {
	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
3852
3853 /*
3854 * Parse a string denoting a range of object IDs of the form
3855 * <start>[:<end>[:flags]], and store the results in zor.
3856 * Return 0 on success. On error, return 1 and update the msg
3857 * pointer to point to a descriptive error message.
3858 */
3859 static int
parse_object_range(char * range,zopt_object_range_t * zor,const char ** msg)3860 parse_object_range(char *range, zopt_object_range_t *zor, const char **msg)
3861 {
3862 uint64_t flags = 0;
3863 char *p, *s, *dup, *flagstr, *tmp = NULL;
3864 size_t len;
3865 int i;
3866 int rc = 0;
3867
3868 if (strchr(range, ':') == NULL) {
3869 zor->zor_obj_start = strtoull(range, &p, 0);
3870 if (*p != '\0') {
3871 *msg = "Invalid characters in object ID";
3872 rc = 1;
3873 }
3874 zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
3875 zor->zor_obj_end = zor->zor_obj_start;
3876 return (rc);
3877 }
3878
3879 if (strchr(range, ':') == range) {
3880 *msg = "Invalid leading colon";
3881 rc = 1;
3882 return (rc);
3883 }
3884
3885 len = strlen(range);
3886 if (range[len - 1] == ':') {
3887 *msg = "Invalid trailing colon";
3888 rc = 1;
3889 return (rc);
3890 }
3891
3892 dup = strdup(range);
3893 s = strtok_r(dup, ":", &tmp);
3894 zor->zor_obj_start = strtoull(s, &p, 0);
3895
3896 if (*p != '\0') {
3897 *msg = "Invalid characters in start object ID";
3898 rc = 1;
3899 goto out;
3900 }
3901
3902 s = strtok_r(NULL, ":", &tmp);
3903 zor->zor_obj_end = strtoull(s, &p, 0);
3904
3905 if (*p != '\0') {
3906 *msg = "Invalid characters in end object ID";
3907 rc = 1;
3908 goto out;
3909 }
3910
3911 if (zor->zor_obj_start > zor->zor_obj_end) {
3912 *msg = "Start object ID may not exceed end object ID";
3913 rc = 1;
3914 goto out;
3915 }
3916
3917 s = strtok_r(NULL, ":", &tmp);
3918 if (s == NULL) {
3919 zor->zor_flags = ZOR_FLAG_ALL_TYPES;
3920 goto out;
3921 } else if (strtok_r(NULL, ":", &tmp) != NULL) {
3922 *msg = "Invalid colon-delimited field after flags";
3923 rc = 1;
3924 goto out;
3925 }
3926
3927 flagstr = s;
3928 for (i = 0; flagstr[i]; i++) {
3929 int bit;
3930 boolean_t negation = (flagstr[i] == '-');
3931
3932 if (negation) {
3933 i++;
3934 if (flagstr[i] == '\0') {
3935 *msg = "Invalid trailing negation operator";
3936 rc = 1;
3937 goto out;
3938 }
3939 }
3940 bit = flagbits[(uchar_t)flagstr[i]];
3941 if (bit == 0) {
3942 *msg = "Invalid flag";
3943 rc = 1;
3944 goto out;
3945 }
3946 if (negation)
3947 flags &= ~bit;
3948 else
3949 flags |= bit;
3950 }
3951 zor->zor_flags = flags;
3952
3953 zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
3954 zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end);
3955
3956 out:
3957 free(dup);
3958 return (rc);
3959 }
3960
/*
 * Print a summary line for an objset and then, depending on the command
 * line options, dump its contents.
 *
 * The summary line reports the dataset name, type, ID, creation txg,
 * referenced bytes and object count; the root block pointer is appended
 * when the -d verbosity is at least 4.
 *
 * If object ranges were requested (zopt_object_args > 0) only the objects
 * in those ranges are dumped and the function returns.  Otherwise the
 * intent log, deadlist/livelist and bookmarks are dumped as applicable and,
 * at verbosity >= 2 with a non-hole root block pointer, every object is
 * dumped followed by dnode slot statistics.
 */
static void
dump_objset(objset_t *os)
{
	dmu_objset_stats_t dds = { 0 };
	uint64_t object, object_count;
	uint64_t refdbytes, usedobjs, scratch;
	char numbuf[32];
	char blkbuf[BP_SPRINTF_LEN + 20];
	char osname[ZFS_MAX_DATASET_NAME_LEN];
	const char *type = "UNKNOWN";
	int verbosity = dump_opt['d'];
	boolean_t print_header;
	unsigned i;
	int error;
	uint64_t total_slots_used = 0;
	uint64_t max_slot_used = 0;
	uint64_t dnode_slots;
	uint64_t obj_start;
	uint64_t obj_end;
	uint64_t flags;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated");

	/* The fast stats are read while holding the pool config. */
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	dmu_objset_fast_stat(os, &dds);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

	print_header = B_TRUE;

	/* Out-of-range types keep the "UNKNOWN" label. */
	if (dds.dds_type < DMU_OST_NUMTYPES)
		type = objset_types[dds.dds_type];

	/*
	 * For the meta objset the creation txg is forced to TXG_INITIAL and
	 * the counts come from the root bp fill and the MOS dir; every other
	 * objset is asked via dmu_objset_space().
	 */
	if (dds.dds_type == DMU_OST_META) {
		dds.dds_creation_txg = TXG_INITIAL;
		usedobjs = BP_GET_FILL(os->os_rootbp);
		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
		    dd_used_bytes;
	} else {
		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
	}

	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));

	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));

	/* The root block pointer is only shown at verbosity 4 and above. */
	if (verbosity >= 4) {
		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
	} else {
		blkbuf[0] = '\0';
	}

	dmu_objset_name(os, osname);

	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
	    "%s, %llu objects%s%s\n",
	    osname, type, (u_longlong_t)dmu_objset_id(os),
	    (u_longlong_t)dds.dds_creation_txg,
	    numbuf, (u_longlong_t)usedobjs, blkbuf,
	    (dds.dds_inconsistent) ? " (inconsistent)" : "");

	/* Dump each object range requested on the command line. */
	for (i = 0; i < zopt_object_args; i++) {
		obj_start = zopt_object_ranges[i].zor_obj_start;
		obj_end = zopt_object_ranges[i].zor_obj_end;
		flags = zopt_object_ranges[i].zor_flags;

		/*
		 * Object 0 and single-object ranges are dumped directly.
		 * Otherwise step back by one, presumably so that the
		 * dmu_object_next() call below can return obj_start itself
		 * (TODO confirm against dmu_object_next()).
		 */
		object = obj_start;
		if (object == 0 || obj_start == obj_end)
			dump_object(os, object, verbosity, &print_header, NULL,
			    flags);
		else
			object--;

		while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
		    object <= obj_end) {
			dump_object(os, object, verbosity, &print_header, NULL,
			    flags);
		}
	}

	/* When specific objects were requested nothing else is dumped. */
	if (zopt_object_args > 0) {
		(void) printf("\n");
		return;
	}

	if (dump_opt['i'] != 0 || verbosity >= 2)
		dump_intent_log(dmu_objset_zil(os));

	if (dmu_objset_ds(os) != NULL) {
		dsl_dataset_t *ds = dmu_objset_ds(os);
		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
		/* The livelist is dumped and verified for non-snapshots only. */
		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
		    !dmu_objset_is_snapshot(os)) {
			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
			if (verify_dd_livelist(os) != 0)
				fatal("livelist is incorrect");
		}

		if (dsl_dataset_remap_deadlist_exists(ds)) {
			(void) printf("ds_remap_deadlist:\n");
			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
		}
		count_ds_mos_objects(ds);
	}

	if (dmu_objset_ds(os) != NULL)
		dump_bookmarks(os, verbosity);

	/* The per-object dump below needs verbosity 2 or higher. */
	if (verbosity < 2)
		return;

	if (BP_IS_HOLE(os->os_rootbp))
		return;

	/* Object 0 and the used-space accounting objects are dumped first. */
	dump_object(os, 0, verbosity, &print_header, NULL, 0);
	object_count = 0;
	if (DMU_USERUSED_DNODE(os) != NULL &&
	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
		    NULL, 0);
		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
		    NULL, 0);
	}

	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
		    &print_header, NULL, 0);

	/*
	 * Walk every object, counting them and accumulating the dnode slots
	 * each one reports so the slot statistics can be printed below.
	 */
	object = 0;
	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
		dump_object(os, object, verbosity, &print_header, &dnode_slots,
		    0);
		object_count++;
		total_slots_used += dnode_slots;
		max_slot_used = object + dnode_slots - 1;
	}

	(void) printf("\n");

	(void) printf(" Dnode slots:\n");
	(void) printf("\tTotal used: %10llu\n",
	    (u_longlong_t)total_slots_used);
	(void) printf("\tMax used: %10llu\n",
	    (u_longlong_t)max_slot_used);
	(void) printf("\tPercent empty: %10lf\n",
	    (double)(max_slot_used - total_slots_used)*100 /
	    (double)max_slot_used);
	(void) printf("\n");

	/* ESRCH is the only accepted way for the walk above to end. */
	if (error != ESRCH) {
		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
		abort();
	}

	ASSERT3U(object_count, ==, usedobjs);

	/* Report and reset the global leak counter. */
	if (leaked_objects != 0) {
		(void) printf("%d potentially leaked objects detected\n",
		    leaked_objects);
		leaked_objects = 0;
	}
}
4126
4127 static void
dump_uberblock(uberblock_t * ub,const char * header,const char * footer)4128 dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
4129 {
4130 time_t timestamp = ub->ub_timestamp;
4131
4132 (void) printf("%s", header ? header : "");
4133 (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
4134 (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
4135 (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
4136 (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
4137 (void) printf("\ttimestamp = %llu UTC = %s",
4138 (u_longlong_t)ub->ub_timestamp, ctime(×tamp));
4139
4140 (void) printf("\tmmp_magic = %016llx\n",
4141 (u_longlong_t)ub->ub_mmp_magic);
4142 if (MMP_VALID(ub)) {
4143 (void) printf("\tmmp_delay = %0llu\n",
4144 (u_longlong_t)ub->ub_mmp_delay);
4145 if (MMP_SEQ_VALID(ub))
4146 (void) printf("\tmmp_seq = %u\n",
4147 (unsigned int) MMP_SEQ(ub));
4148 if (MMP_FAIL_INT_VALID(ub))
4149 (void) printf("\tmmp_fail = %u\n",
4150 (unsigned int) MMP_FAIL_INT(ub));
4151 if (MMP_INTERVAL_VALID(ub))
4152 (void) printf("\tmmp_write = %u\n",
4153 (unsigned int) MMP_INTERVAL(ub));
4154 /* After MMP_* to make summarize_uberblock_mmp cleaner */
4155 (void) printf("\tmmp_valid = %x\n",
4156 (unsigned int) ub->ub_mmp_config & 0xFF);
4157 }
4158
4159 if (dump_opt['u'] >= 4) {
4160 char blkbuf[BP_SPRINTF_LEN];
4161 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
4162 (void) printf("\trootbp = %s\n", blkbuf);
4163 }
4164 (void) printf("\tcheckpoint_txg = %llu\n",
4165 (u_longlong_t)ub->ub_checkpoint_txg);
4166
4167 (void) printf("\traidz_reflow state=%u off=%llu\n",
4168 (int)RRSS_GET_STATE(ub),
4169 (u_longlong_t)RRSS_GET_OFFSET(ub));
4170
4171 (void) printf("%s", footer ? footer : "");
4172 }
4173
4174 static void
dump_config(spa_t * spa)4175 dump_config(spa_t *spa)
4176 {
4177 dmu_buf_t *db;
4178 size_t nvsize = 0;
4179 int error = 0;
4180
4181
4182 error = dmu_bonus_hold(spa->spa_meta_objset,
4183 spa->spa_config_object, FTAG, &db);
4184
4185 if (error == 0) {
4186 nvsize = *(uint64_t *)db->db_data;
4187 dmu_buf_rele(db, FTAG);
4188
4189 (void) printf("\nMOS Configuration:\n");
4190 dump_packed_nvlist(spa->spa_meta_objset,
4191 spa->spa_config_object, (void *)&nvsize, 1);
4192 } else {
4193 (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
4194 (u_longlong_t)spa->spa_config_object, error);
4195 }
4196 }
4197
4198 static void
dump_cachefile(const char * cachefile)4199 dump_cachefile(const char *cachefile)
4200 {
4201 int fd;
4202 struct stat64 statbuf;
4203 char *buf;
4204 nvlist_t *config;
4205
4206 if ((fd = open64(cachefile, O_RDONLY)) < 0) {
4207 (void) printf("cannot open '%s': %s\n", cachefile,
4208 strerror(errno));
4209 zdb_exit(1);
4210 }
4211
4212 if (fstat64(fd, &statbuf) != 0) {
4213 (void) printf("failed to stat '%s': %s\n", cachefile,
4214 strerror(errno));
4215 zdb_exit(1);
4216 }
4217
4218 if ((buf = malloc(statbuf.st_size)) == NULL) {
4219 (void) fprintf(stderr, "failed to allocate %llu bytes\n",
4220 (u_longlong_t)statbuf.st_size);
4221 zdb_exit(1);
4222 }
4223
4224 if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
4225 (void) fprintf(stderr, "failed to read %llu bytes\n",
4226 (u_longlong_t)statbuf.st_size);
4227 zdb_exit(1);
4228 }
4229
4230 (void) close(fd);
4231
4232 if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
4233 (void) fprintf(stderr, "failed to unpack nvlist\n");
4234 zdb_exit(1);
4235 }
4236
4237 free(buf);
4238
4239 dump_nvlist(config, 0);
4240
4241 nvlist_free(config);
4242 }
4243
/*
 * ZFS label nvlist stats
 *
 * Accumulator filled in by collect_nvlist_stats() and reported by
 * dump_nvlist_stats().
 */
typedef struct zdb_nvl_stats {
	/* number of nvlists visited, including nested ones */
	int		zns_list_count;
	/* number of elements found in nvlist arrays named "children" */
	int		zns_leaf_count;
	/* largest XDR-encoded size among those elements */
	size_t		zns_leaf_largest;
	/* sum of the XDR-encoded sizes of those elements */
	size_t		zns_leaf_total;
	/* every string pair encountered */
	nvlist_t	*zns_string;
	/* every uint64 pair encountered */
	nvlist_t	*zns_uint64;
	/* every boolean pair encountered */
	nvlist_t	*zns_boolean;
} zdb_nvl_stats_t;
4256
4257 static void
collect_nvlist_stats(nvlist_t * nvl,zdb_nvl_stats_t * stats)4258 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
4259 {
4260 nvlist_t *list, **array;
4261 nvpair_t *nvp = NULL;
4262 const char *name;
4263 uint_t i, items;
4264
4265 stats->zns_list_count++;
4266
4267 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4268 name = nvpair_name(nvp);
4269
4270 switch (nvpair_type(nvp)) {
4271 case DATA_TYPE_STRING:
4272 fnvlist_add_string(stats->zns_string, name,
4273 fnvpair_value_string(nvp));
4274 break;
4275 case DATA_TYPE_UINT64:
4276 fnvlist_add_uint64(stats->zns_uint64, name,
4277 fnvpair_value_uint64(nvp));
4278 break;
4279 case DATA_TYPE_BOOLEAN:
4280 fnvlist_add_boolean(stats->zns_boolean, name);
4281 break;
4282 case DATA_TYPE_NVLIST:
4283 if (nvpair_value_nvlist(nvp, &list) == 0)
4284 collect_nvlist_stats(list, stats);
4285 break;
4286 case DATA_TYPE_NVLIST_ARRAY:
4287 if (nvpair_value_nvlist_array(nvp, &array, &items) != 0)
4288 break;
4289
4290 for (i = 0; i < items; i++) {
4291 collect_nvlist_stats(array[i], stats);
4292
4293 /* collect stats on leaf vdev */
4294 if (strcmp(name, "children") == 0) {
4295 size_t size;
4296
4297 (void) nvlist_size(array[i], &size,
4298 NV_ENCODE_XDR);
4299 stats->zns_leaf_total += size;
4300 if (size > stats->zns_leaf_largest)
4301 stats->zns_leaf_largest = size;
4302 stats->zns_leaf_count++;
4303 }
4304 }
4305 break;
4306 default:
4307 (void) printf("skip type %d!\n", (int)nvpair_type(nvp));
4308 }
4309 }
4310 }
4311
4312 static void
dump_nvlist_stats(nvlist_t * nvl,size_t cap)4313 dump_nvlist_stats(nvlist_t *nvl, size_t cap)
4314 {
4315 zdb_nvl_stats_t stats = { 0 };
4316 size_t size, sum = 0, total;
4317 size_t noise;
4318
4319 /* requires nvlist with non-unique names for stat collection */
4320 VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
4321 VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
4322 VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
4323 VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));
4324
4325 (void) printf("\n\nZFS Label NVList Config Stats:\n");
4326
4327 VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
4328 (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n",
4329 (int)total, (int)(cap - total), 100.0 * total / cap);
4330
4331 collect_nvlist_stats(nvl, &stats);
4332
4333 VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR));
4334 size -= noise;
4335 sum += size;
4336 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:",
4337 (int)fnvlist_num_pairs(stats.zns_uint64),
4338 (int)size, 100.0 * size / total);
4339
4340 VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR));
4341 size -= noise;
4342 sum += size;
4343 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:",
4344 (int)fnvlist_num_pairs(stats.zns_string),
4345 (int)size, 100.0 * size / total);
4346
4347 VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR));
4348 size -= noise;
4349 sum += size;
4350 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:",
4351 (int)fnvlist_num_pairs(stats.zns_boolean),
4352 (int)size, 100.0 * size / total);
4353
4354 size = total - sum; /* treat remainder as nvlist overhead */
4355 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
4356 stats.zns_list_count, (int)size, 100.0 * size / total);
4357
4358 if (stats.zns_leaf_count > 0) {
4359 size_t average = stats.zns_leaf_total / stats.zns_leaf_count;
4360
4361 (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
4362 stats.zns_leaf_count, (int)average);
4363 (void) printf("%24d bytes largest\n",
4364 (int)stats.zns_leaf_largest);
4365
4366 if (dump_opt['l'] >= 3 && average > 0)
4367 (void) printf(" space for %d additional leaf vdevs\n",
4368 (int)((cap - total) / average));
4369 }
4370 (void) printf("\n");
4371
4372 nvlist_free(stats.zns_string);
4373 nvlist_free(stats.zns_uint64);
4374 nvlist_free(stats.zns_boolean);
4375 }
4376
/*
 * One distinct checksum together with the set of labels it was seen in.
 * Records are kept in an AVL tree ordered by cksum_record_compare().
 */
typedef struct cksum_record {
	/* the checksum; compared word by word to order the tree */
	zio_cksum_t cksum;
	/* labels[l] is B_TRUE if label l was recorded with this checksum */
	boolean_t labels[VDEV_LABELS];
	/* AVL linkage; presumably the node the owning tree was set up with */
	avl_node_t link;
} cksum_record_t;
4382
4383 static int
cksum_record_compare(const void * x1,const void * x2)4384 cksum_record_compare(const void *x1, const void *x2)
4385 {
4386 const cksum_record_t *l = (cksum_record_t *)x1;
4387 const cksum_record_t *r = (cksum_record_t *)x2;
4388 int arraysize = ARRAY_SIZE(l->cksum.zc_word);
4389 int difference = 0;
4390
4391 for (int i = 0; i < arraysize; i++) {
4392 difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
4393 if (difference)
4394 break;
4395 }
4396
4397 return (difference);
4398 }
4399
4400 static cksum_record_t *
cksum_record_alloc(zio_cksum_t * cksum,int l)4401 cksum_record_alloc(zio_cksum_t *cksum, int l)
4402 {
4403 cksum_record_t *rec;
4404
4405 rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL);
4406 rec->cksum = *cksum;
4407 rec->labels[l] = B_TRUE;
4408
4409 return (rec);
4410 }
4411
4412 static cksum_record_t *
cksum_record_lookup(avl_tree_t * tree,zio_cksum_t * cksum)4413 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
4414 {
4415 cksum_record_t lookup = { .cksum = *cksum };
4416 avl_index_t where;
4417
4418 return (avl_find(tree, &lookup, &where));
4419 }
4420
4421 static cksum_record_t *
cksum_record_insert(avl_tree_t * tree,zio_cksum_t * cksum,int l)4422 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
4423 {
4424 cksum_record_t *rec;
4425
4426 rec = cksum_record_lookup(tree, cksum);
4427 if (rec) {
4428 rec->labels[l] = B_TRUE;
4429 } else {
4430 rec = cksum_record_alloc(cksum, l);
4431 avl_add(tree, rec);
4432 }
4433
4434 return (rec);
4435 }
4436
4437 static int
first_label(cksum_record_t * rec)4438 first_label(cksum_record_t *rec)
4439 {
4440 for (int i = 0; i < VDEV_LABELS; i++)
4441 if (rec->labels[i])
4442 return (i);
4443
4444 return (-1);
4445 }
4446
4447 static void
print_label_numbers(const char * prefix,const cksum_record_t * rec)4448 print_label_numbers(const char *prefix, const cksum_record_t *rec)
4449 {
4450 fputs(prefix, stdout);
4451 for (int i = 0; i < VDEV_LABELS; i++)
4452 if (rec->labels[i] == B_TRUE)
4453 printf("%d ", i);
4454 putchar('\n');
4455 }
4456
/*
 * Size of the uberblocks[] array below: the uberblock ring size divided by
 * 2^UBERBLOCK_SHIFT (presumably the largest slot count any ashift yields).
 */
#define	MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)

/*
 * Per-label state used while dumping vdev labels.
 */
typedef struct zdb_label {
	/* label contents; uberblocks are located by offset within it */
	vdev_label_t label;
	/* NOTE(review): presumably the label's offset on the device */
	uint64_t label_offset;
	/* config nvlist printed by dump_config_from_label() */
	nvlist_t *config_nv;
	/* checksum record for the config, shared by labels that match */
	cksum_record_t *config;
	/* per-slot checksum record; NULL is reported as an invalid slot */
	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
	/* set once print_label_header() has printed the header */
	boolean_t header_printed;
	/* NOTE(review): presumably set when the label could not be read */
	boolean_t read_failed;
	/* B_FALSE makes the header say "(Bad label cksum)" */
	boolean_t cksum_valid;
} zdb_label_t;
4469
4470 static void
print_label_header(zdb_label_t * label,int l)4471 print_label_header(zdb_label_t *label, int l)
4472 {
4473
4474 if (dump_opt['q'])
4475 return;
4476
4477 if (label->header_printed == B_TRUE)
4478 return;
4479
4480 (void) printf("------------------------------------\n");
4481 (void) printf("LABEL %d %s\n", l,
4482 label->cksum_valid ? "" : "(Bad label cksum)");
4483 (void) printf("------------------------------------\n");
4484
4485 label->header_printed = B_TRUE;
4486 }
4487
/*
 * Print the banner that introduces the L2ARC device header section.
 */
static void
print_l2arc_header(void)
{
	(void) printf(
	    "------------------------------------\n"
	    "L2ARC device header\n"
	    "------------------------------------\n");
}
4495
/*
 * Print the banner that introduces the L2ARC log blocks section.
 */
static void
print_l2arc_log_blocks(void)
{
	(void) printf(
	    "------------------------------------\n"
	    "L2ARC device log blocks\n"
	    "------------------------------------\n");
}
4503
4504 static void
dump_l2arc_log_entries(uint64_t log_entries,l2arc_log_ent_phys_t * le,uint64_t i)4505 dump_l2arc_log_entries(uint64_t log_entries,
4506 l2arc_log_ent_phys_t *le, uint64_t i)
4507 {
4508 for (int j = 0; j < log_entries; j++) {
4509 dva_t dva = le[j].le_dva;
4510 (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "
4511 "vdev: %llu, offset: %llu\n",
4512 (u_longlong_t)i, j + 1,
4513 (u_longlong_t)DVA_GET_ASIZE(&dva),
4514 (u_longlong_t)DVA_GET_VDEV(&dva),
4515 (u_longlong_t)DVA_GET_OFFSET(&dva));
4516 (void) printf("|\t\t\t\tbirth: %llu\n",
4517 (u_longlong_t)le[j].le_birth);
4518 (void) printf("|\t\t\t\tlsize: %llu\n",
4519 (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
4520 (void) printf("|\t\t\t\tpsize: %llu\n",
4521 (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
4522 (void) printf("|\t\t\t\tcompr: %llu\n",
4523 (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
4524 (void) printf("|\t\t\t\tcomplevel: %llu\n",
4525 (u_longlong_t)(&le[j])->le_complevel);
4526 (void) printf("|\t\t\t\ttype: %llu\n",
4527 (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
4528 (void) printf("|\t\t\t\tprotected: %llu\n",
4529 (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
4530 (void) printf("|\t\t\t\tprefetch: %llu\n",
4531 (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
4532 (void) printf("|\t\t\t\taddress: %llu\n",
4533 (u_longlong_t)le[j].le_daddr);
4534 (void) printf("|\t\t\t\tARC state: %llu\n",
4535 (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop));
4536 (void) printf("|\n");
4537 }
4538 (void) printf("\n");
4539 }
4540
4541 static void
dump_l2arc_log_blkptr(const l2arc_log_blkptr_t * lbps)4542 dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps)
4543 {
4544 (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr);
4545 (void) printf("|\t\tpayload_asize: %llu\n",
4546 (u_longlong_t)lbps->lbp_payload_asize);
4547 (void) printf("|\t\tpayload_start: %llu\n",
4548 (u_longlong_t)lbps->lbp_payload_start);
4549 (void) printf("|\t\tlsize: %llu\n",
4550 (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop));
4551 (void) printf("|\t\tasize: %llu\n",
4552 (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop));
4553 (void) printf("|\t\tcompralgo: %llu\n",
4554 (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop));
4555 (void) printf("|\t\tcksumalgo: %llu\n",
4556 (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop));
4557 (void) printf("|\n\n");
4558 }
4559
/*
 * Walk the chain of L2ARC log blocks on the device open as fd, starting
 * from the two log block pointers stored in the device header l2dhdr.
 *
 * Each block is read, checked against the fletcher-4 checksum in its
 * pointer, decompressed unless stored uncompressed, and byteswapped if its
 * magic appears in the opposite byte order.  The count and aligned size of
 * the blocks that pass are accumulated in rebuild->dh_lb_count and
 * rebuild->dh_lb_asize.  Blocks and entries are printed according to the
 * -l verbosity; -q suppresses all output.
 */
static void
dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
    l2arc_dev_hdr_phys_t *rebuild)
{
	l2arc_log_blk_phys_t this_lb;
	uint64_t asize;
	l2arc_log_blkptr_t lbps[2];
	abd_t *abd;
	zio_cksum_t cksum;
	int failed = 0;
	l2arc_dev_t dev;

	if (!dump_opt['q'])
		print_l2arc_log_blocks();
	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));

	/* Only the device fields assigned here and below are initialized. */
	dev.l2ad_evict = l2dhdr->dh_evict;
	dev.l2ad_start = l2dhdr->dh_start;
	dev.l2ad_end = l2dhdr->dh_end;

	if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) {
		/* no log blocks to read */
		if (!dump_opt['q']) {
			(void) printf("No log blocks to read\n");
			(void) printf("\n");
		}
		return;
	} else {
		/* The hand is set just past the first log block. */
		dev.l2ad_hand = lbps[0].lbp_daddr +
		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
	}

	dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);

	for (;;) {
		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
			break;

		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
			if (!dump_opt['q']) {
				(void) printf("Error while reading next log "
				    "block\n\n");
			}
			break;
		}

		/* A checksum mismatch ends the walk after being counted. */
		fletcher_4_native_varsize(&this_lb, asize, &cksum);
		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
			failed++;
			if (!dump_opt['q']) {
				(void) printf("Invalid cksum\n");
				dump_l2arc_log_blkptr(&lbps[0]);
			}
			break;
		}

		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
		case ZIO_COMPRESS_OFF:
			break;
		default:
			/*
			 * Copy the compressed bytes into an abd so that the
			 * block can be decompressed back into this_lb.
			 */
			abd = abd_alloc_for_io(asize, B_TRUE);
			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
			if (zio_decompress_data(L2BLK_GET_COMPRESS(
			    (&lbps[0])->lbp_prop), abd, &this_lb,
			    asize, sizeof (this_lb), NULL) != 0) {
				(void) printf("L2ARC block decompression "
				    "failed\n");
				abd_free(abd);
				goto out;
			}
			abd_free(abd);
			break;
		}

		/* Accept a byteswapped magic by swapping the whole block. */
		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
			byteswap_uint64_array(&this_lb, sizeof (this_lb));
		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
			if (!dump_opt['q'])
				(void) printf("Invalid log block magic\n\n");
			break;
		}

		rebuild->dh_lb_count++;
		rebuild->dh_lb_asize += asize;
		if (dump_opt['l'] > 1 && !dump_opt['q']) {
			(void) printf("lb[%4llu]\tmagic: %llu\n",
			    (u_longlong_t)rebuild->dh_lb_count,
			    (u_longlong_t)this_lb.lb_magic);
			dump_l2arc_log_blkptr(&lbps[0]);
		}

		if (dump_opt['l'] > 2 && !dump_opt['q'])
			dump_l2arc_log_entries(l2dhdr->dh_log_entries,
			    this_lb.lb_entries,
			    rebuild->dh_lb_count);

		/*
		 * Stop when the overlap check against the evict offset
		 * succeeds, unless the device header had the EVICT_FIRST
		 * flag set.
		 */
		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
		    !dev.l2ad_first)
			break;

		/* Advance: the block just read supplies the next pointer. */
		lbps[0] = lbps[1];
		lbps[1] = this_lb.lb_prev_lbp;
	}
out:
	if (!dump_opt['q']) {
		(void) printf("log_blk_count:\t %llu with valid cksum\n",
		    (u_longlong_t)rebuild->dh_lb_count);
		(void) printf("\t\t %d with invalid cksum\n", failed);
		(void) printf("log_blk_asize:\t %llu\n\n",
		    (u_longlong_t)rebuild->dh_lb_asize);
	}
}
4675
4676 static int
dump_l2arc_header(int fd)4677 dump_l2arc_header(int fd)
4678 {
4679 l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0};
4680 int error = B_FALSE;
4681
4682 if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
4683 VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
4684 error = B_TRUE;
4685 } else {
4686 if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
4687 byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));
4688
4689 if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
4690 error = B_TRUE;
4691 }
4692
4693 if (error) {
4694 (void) printf("L2ARC device header not found\n\n");
4695 /* Do not return an error here for backward compatibility */
4696 return (0);
4697 } else if (!dump_opt['q']) {
4698 print_l2arc_header();
4699
4700 (void) printf(" magic: %llu\n",
4701 (u_longlong_t)l2dhdr.dh_magic);
4702 (void) printf(" version: %llu\n",
4703 (u_longlong_t)l2dhdr.dh_version);
4704 (void) printf(" pool_guid: %llu\n",
4705 (u_longlong_t)l2dhdr.dh_spa_guid);
4706 (void) printf(" flags: %llu\n",
4707 (u_longlong_t)l2dhdr.dh_flags);
4708 (void) printf(" start_lbps[0]: %llu\n",
4709 (u_longlong_t)
4710 l2dhdr.dh_start_lbps[0].lbp_daddr);
4711 (void) printf(" start_lbps[1]: %llu\n",
4712 (u_longlong_t)
4713 l2dhdr.dh_start_lbps[1].lbp_daddr);
4714 (void) printf(" log_blk_ent: %llu\n",
4715 (u_longlong_t)l2dhdr.dh_log_entries);
4716 (void) printf(" start: %llu\n",
4717 (u_longlong_t)l2dhdr.dh_start);
4718 (void) printf(" end: %llu\n",
4719 (u_longlong_t)l2dhdr.dh_end);
4720 (void) printf(" evict: %llu\n",
4721 (u_longlong_t)l2dhdr.dh_evict);
4722 (void) printf(" lb_asize_refcount: %llu\n",
4723 (u_longlong_t)l2dhdr.dh_lb_asize);
4724 (void) printf(" lb_count_refcount: %llu\n",
4725 (u_longlong_t)l2dhdr.dh_lb_count);
4726 (void) printf(" trim_action_time: %llu\n",
4727 (u_longlong_t)l2dhdr.dh_trim_action_time);
4728 (void) printf(" trim_state: %llu\n\n",
4729 (u_longlong_t)l2dhdr.dh_trim_state);
4730 }
4731
4732 dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild);
4733 /*
4734 * The total aligned size of log blocks and the number of log blocks
4735 * reported in the header of the device may be less than what zdb
4736 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
4737 * This happens because dump_l2arc_log_blocks() lacks the memory
4738 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
4739 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
4740 * and dh_lb_count will be lower to begin with than what exists on the
4741 * device. This is normal and zdb should not exit with an error. The
4742 * opposite case should never happen though, the values reported in the
4743 * header should never be higher than what dump_l2arc_log_blocks() and
4744 * l2arc_rebuild() report. If this happens there is a leak in the
4745 * accounting of log blocks.
4746 */
4747 if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
4748 l2dhdr.dh_lb_count > rebuild.dh_lb_count)
4749 return (1);
4750
4751 return (0);
4752 }
4753
4754 static void
dump_config_from_label(zdb_label_t * label,size_t buflen,int l)4755 dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
4756 {
4757 if (dump_opt['q'])
4758 return;
4759
4760 if ((dump_opt['l'] < 3) && (first_label(label->config) != l))
4761 return;
4762
4763 print_label_header(label, l);
4764 dump_nvlist(label->config_nv, 4);
4765 print_label_numbers(" labels = ", label->config);
4766
4767 if (dump_opt['l'] >= 2)
4768 dump_nvlist_stats(label->config_nv, buflen);
4769 }
4770
4771 #define ZDB_MAX_UB_HEADER_SIZE 32
4772
4773 static void
dump_label_uberblocks(zdb_label_t * label,uint64_t ashift,int label_num)4774 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
4775 {
4776
4777 vdev_t vd;
4778 char header[ZDB_MAX_UB_HEADER_SIZE];
4779
4780 vd.vdev_ashift = ashift;
4781 vd.vdev_top = &vd;
4782
4783 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
4784 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
4785 uberblock_t *ub = (void *)((char *)&label->label + uoff);
4786 cksum_record_t *rec = label->uberblocks[i];
4787
4788 if (rec == NULL) {
4789 if (dump_opt['u'] >= 2) {
4790 print_label_header(label, label_num);
4791 (void) printf(" Uberblock[%d] invalid\n", i);
4792 }
4793 continue;
4794 }
4795
4796 if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
4797 continue;
4798
4799 if ((dump_opt['u'] < 4) &&
4800 (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
4801 (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
4802 continue;
4803
4804 print_label_header(label, label_num);
4805 (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
4806 " Uberblock[%d]\n", i);
4807 dump_uberblock(ub, header, "");
4808 print_label_numbers(" labels = ", rec);
4809 }
4810 }
4811
4812 static char curpath[PATH_MAX];
4813
4814 /*
4815 * Iterate through the path components, recursively passing
4816 * current one's obj and remaining path until we find the obj
4817 * for the last one.
4818 */
4819 static int
dump_path_impl(objset_t * os,uint64_t obj,char * name,uint64_t * retobj)4820 dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)
4821 {
4822 int err;
4823 boolean_t header = B_TRUE;
4824 uint64_t child_obj;
4825 char *s;
4826 dmu_buf_t *db;
4827 dmu_object_info_t doi;
4828
4829 if ((s = strchr(name, '/')) != NULL)
4830 *s = '\0';
4831 err = zap_lookup(os, obj, name, 8, 1, &child_obj);
4832
4833 (void) strlcat(curpath, name, sizeof (curpath));
4834
4835 if (err != 0) {
4836 (void) fprintf(stderr, "failed to lookup %s: %s\n",
4837 curpath, strerror(err));
4838 return (err);
4839 }
4840
4841 child_obj = ZFS_DIRENT_OBJ(child_obj);
4842 err = sa_buf_hold(os, child_obj, FTAG, &db);
4843 if (err != 0) {
4844 (void) fprintf(stderr,
4845 "failed to get SA dbuf for obj %llu: %s\n",
4846 (u_longlong_t)child_obj, strerror(err));
4847 return (EINVAL);
4848 }
4849 dmu_object_info_from_db(db, &doi);
4850 sa_buf_rele(db, FTAG);
4851
4852 if (doi.doi_bonus_type != DMU_OT_SA &&
4853 doi.doi_bonus_type != DMU_OT_ZNODE) {
4854 (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
4855 doi.doi_bonus_type, (u_longlong_t)child_obj);
4856 return (EINVAL);
4857 }
4858
4859 if (dump_opt['v'] > 6) {
4860 (void) printf("obj=%llu %s type=%d bonustype=%d\n",
4861 (u_longlong_t)child_obj, curpath, doi.doi_type,
4862 doi.doi_bonus_type);
4863 }
4864
4865 (void) strlcat(curpath, "/", sizeof (curpath));
4866
4867 switch (doi.doi_type) {
4868 case DMU_OT_DIRECTORY_CONTENTS:
4869 if (s != NULL && *(s + 1) != '\0')
4870 return (dump_path_impl(os, child_obj, s + 1, retobj));
4871 zfs_fallthrough;
4872 case DMU_OT_PLAIN_FILE_CONTENTS:
4873 if (retobj != NULL) {
4874 *retobj = child_obj;
4875 } else {
4876 dump_object(os, child_obj, dump_opt['v'], &header,
4877 NULL, 0);
4878 }
4879 return (0);
4880 default:
4881 (void) fprintf(stderr, "object %llu has non-file/directory "
4882 "type %d\n", (u_longlong_t)obj, doi.doi_type);
4883 break;
4884 }
4885
4886 return (EINVAL);
4887 }
4888
4889 /*
4890 * Dump the blocks for the object specified by path inside the dataset.
4891 */
4892 static int
dump_path(char * ds,char * path,uint64_t * retobj)4893 dump_path(char *ds, char *path, uint64_t *retobj)
4894 {
4895 int err;
4896 objset_t *os;
4897 uint64_t root_obj;
4898
4899 err = open_objset(ds, FTAG, &os);
4900 if (err != 0)
4901 return (err);
4902
4903 err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
4904 if (err != 0) {
4905 (void) fprintf(stderr, "can't lookup root znode: %s\n",
4906 strerror(err));
4907 close_objset(os, FTAG);
4908 return (EINVAL);
4909 }
4910
4911 (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
4912
4913 err = dump_path_impl(os, root_obj, path, retobj);
4914
4915 close_objset(os, FTAG);
4916 return (err);
4917 }
4918
4919 static int
dump_backup_bytes(objset_t * os,void * buf,int len,void * arg)4920 dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)
4921 {
4922 const char *p = (const char *)buf;
4923 ssize_t nwritten;
4924
4925 (void) os;
4926 (void) arg;
4927
4928 /* Write the data out, handling short writes and signals. */
4929 while ((nwritten = write(STDOUT_FILENO, p, len)) < len) {
4930 if (nwritten < 0) {
4931 if (errno == EINTR)
4932 continue;
4933 return (errno);
4934 }
4935 p += nwritten;
4936 len -= nwritten;
4937 }
4938
4939 return (0);
4940 }
4941
4942 static void
dump_backup(const char * pool,uint64_t objset_id,const char * flagstr)4943 dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)
4944 {
4945 boolean_t embed = B_FALSE;
4946 boolean_t large_block = B_FALSE;
4947 boolean_t compress = B_FALSE;
4948 boolean_t raw = B_FALSE;
4949
4950 const char *c;
4951 for (c = flagstr; c != NULL && *c != '\0'; c++) {
4952 switch (*c) {
4953 case 'e':
4954 embed = B_TRUE;
4955 break;
4956 case 'L':
4957 large_block = B_TRUE;
4958 break;
4959 case 'c':
4960 compress = B_TRUE;
4961 break;
4962 case 'w':
4963 raw = B_TRUE;
4964 break;
4965 default:
4966 fprintf(stderr, "dump_backup: invalid flag "
4967 "'%c'\n", *c);
4968 return;
4969 }
4970 }
4971
4972 if (isatty(STDOUT_FILENO)) {
4973 fprintf(stderr, "dump_backup: stream cannot be written "
4974 "to a terminal\n");
4975 return;
4976 }
4977
4978 offset_t off = 0;
4979 dmu_send_outparams_t out = {
4980 .dso_outfunc = dump_backup_bytes,
4981 .dso_dryrun = B_FALSE,
4982 };
4983
4984 int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,
4985 large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,
4986 &off, &out);
4987 if (err != 0) {
4988 fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",
4989 strerror(err));
4990 return;
4991 }
4992 }
4993
4994 static int
zdb_copy_object(objset_t * os,uint64_t srcobj,char * destfile)4995 zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
4996 {
4997 int err = 0;
4998 uint64_t size, readsize, oursize, offset;
4999 ssize_t writesize;
5000 sa_handle_t *hdl;
5001
5002 (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj,
5003 destfile);
5004
5005 VERIFY3P(os, ==, sa_os);
5006 if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) {
5007 (void) printf("Failed to get handle for SA znode\n");
5008 return (err);
5009 }
5010 if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) {
5011 (void) sa_handle_destroy(hdl);
5012 return (err);
5013 }
5014 (void) sa_handle_destroy(hdl);
5015
5016 (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj,
5017 size);
5018 if (size == 0) {
5019 return (EINVAL);
5020 }
5021
5022 int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
5023 if (fd == -1)
5024 return (errno);
5025 /*
5026 * We cap the size at 1 mebibyte here to prevent
5027 * allocation failures and nigh-infinite printing if the
5028 * object is extremely large.
5029 */
5030 oursize = MIN(size, 1 << 20);
5031 offset = 0;
5032 char *buf = kmem_alloc(oursize, KM_NOSLEEP);
5033 if (buf == NULL) {
5034 (void) close(fd);
5035 return (ENOMEM);
5036 }
5037
5038 while (offset < size) {
5039 readsize = MIN(size - offset, 1 << 20);
5040 err = dmu_read(os, srcobj, offset, readsize, buf, 0);
5041 if (err != 0) {
5042 (void) printf("got error %u from dmu_read\n", err);
5043 kmem_free(buf, oursize);
5044 (void) close(fd);
5045 return (err);
5046 }
5047 if (dump_opt['v'] > 3) {
5048 (void) printf("Read offset=%" PRIu64 " size=%" PRIu64
5049 " error=%d\n", offset, readsize, err);
5050 }
5051
5052 writesize = write(fd, buf, readsize);
5053 if (writesize < 0) {
5054 err = errno;
5055 break;
5056 } else if (writesize != readsize) {
5057 /* Incomplete write */
5058 (void) fprintf(stderr, "Short write, only wrote %llu of"
5059 " %" PRIu64 " bytes, exiting...\n",
5060 (u_longlong_t)writesize, readsize);
5061 break;
5062 }
5063
5064 offset += readsize;
5065 }
5066
5067 (void) close(fd);
5068
5069 if (buf != NULL)
5070 kmem_free(buf, oursize);
5071
5072 return (err);
5073 }
5074
5075 static boolean_t
label_cksum_valid(vdev_label_t * label,uint64_t offset)5076 label_cksum_valid(vdev_label_t *label, uint64_t offset)
5077 {
5078 zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
5079 zio_cksum_t expected_cksum;
5080 zio_cksum_t actual_cksum;
5081 zio_cksum_t verifier;
5082 zio_eck_t *eck;
5083 int byteswap;
5084
5085 void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys);
5086 eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1;
5087
5088 offset += offsetof(vdev_label_t, vl_vdev_phys);
5089 ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0);
5090
5091 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
5092 if (byteswap)
5093 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
5094
5095 expected_cksum = eck->zec_cksum;
5096 eck->zec_cksum = verifier;
5097
5098 abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE);
5099 ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum);
5100 abd_free(abd);
5101
5102 if (byteswap)
5103 byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t));
5104
5105 if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
5106 return (B_TRUE);
5107
5108 return (B_FALSE);
5109 }
5110
/*
 * Print the vdev labels found on the device or file named by 'dev', plus
 * the uberblocks when dump_opt['u'] is set and the L2ARC header when a
 * label's pool state is POOL_STATE_L2CACHE.
 *
 * 'dev' is used as given when it starts with '/' or when stat64() succeeds
 * on it; otherwise it is resolved through zfs_resolve_shortname().
 *
 * Returns 2 when no label configuration could be unpacked, 1 when a
 * configuration was found but an error was recorded along the way (or the
 * device could not be located), and 0 otherwise.  Failure to open or stat
 * the resolved path ends the program through zdb_exit(1).
 */
static int
dump_label(const char *dev)
{
	char path[MAXPATHLEN];
	zdb_label_t labels[VDEV_LABELS] = {{{{0}}}};
	uint64_t psize, ashift, l2cache;
	struct stat64 statbuf;
	boolean_t config_found = B_FALSE;
	boolean_t error = B_FALSE;
	boolean_t read_l2arc_header = B_FALSE;
	avl_tree_t config_tree;
	avl_tree_t uberblock_tree;
	void *node, *cookie;
	int fd;

	/*
	 * Check if we were given absolute path and use it as is.
	 * Otherwise if the provided vdev name doesn't point to a file,
	 * try prepending expected disk paths and partition numbers.
	 */
	(void) strlcpy(path, dev, sizeof (path));
	if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
		/* NOTE: shadows the function-level boolean 'error'. */
		int error;

		error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
		if (error == 0 && zfs_dev_is_whole_disk(path)) {
			if (zfs_append_partition(path, MAXPATHLEN) == -1)
				error = ENOENT;
		}

		if (error || (stat64(path, &statbuf) != 0)) {
			(void) printf("failed to find device %s, try "
			    "specifying absolute path instead\n", dev);
			return (1);
		}
	}

	if ((fd = open64(path, O_RDONLY)) < 0) {
		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
		zdb_exit(1);
	}

	if (fstat64_blk(fd, &statbuf) != 0) {
		(void) printf("failed to stat '%s': %s\n", path,
		    strerror(errno));
		(void) close(fd);
		zdb_exit(1);
	}

	/* A failed flush on a block device is reported but not fatal. */
	if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
		(void) printf("failed to invalidate cache '%s' : %s\n", path,
		    strerror(errno));

	/* Both trees hold cksum_record_t nodes keyed by checksum. */
	avl_create(&config_tree, cksum_record_compare,
	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
	avl_create(&uberblock_tree, cksum_record_compare,
	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));

	/* Round the device size down to a whole number of labels. */
	psize = statbuf.st_size;
	psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);
	ashift = SPA_MINBLOCKSHIFT;

	/*
	 * 1. Read the label from disk
	 * 2. Verify label cksum
	 * 3. Unpack the configuration and insert in config tree.
	 * 4. Traverse all uberblocks and insert in uberblock tree.
	 */
	for (int l = 0; l < VDEV_LABELS; l++) {
		zdb_label_t *label = &labels[l];
		char *buf = label->label.vl_vdev_phys.vp_nvlist;
		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
		nvlist_t *config;
		cksum_record_t *rec;
		zio_cksum_t cksum;
		vdev_t vd;

		label->label_offset = vdev_label_offset(psize, l, 0);

		/* A short or failed read skips this label entirely. */
		if (pread64(fd, &label->label, sizeof (label->label),
		    label->label_offset) != sizeof (label->label)) {
			if (!dump_opt['q'])
				(void) printf("failed to read label %d\n", l);
			label->read_failed = B_TRUE;
			error = B_TRUE;
			continue;
		}

		label->read_failed = B_FALSE;
		label->cksum_valid = label_cksum_valid(&label->label,
		    label->label_offset);

		if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
			nvlist_t *vdev_tree = NULL;
			size_t size;

			/* Fall back to the minimum ashift if none is found. */
			if ((nvlist_lookup_nvlist(config,
			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
			    (nvlist_lookup_uint64(vdev_tree,
			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
				ashift = SPA_MINBLOCKSHIFT;

			/* Checksum the whole buffer if the size is unknown. */
			if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
				size = buflen;

			/* If the device is a cache device read the header. */
			if (!read_l2arc_header) {
				if (nvlist_lookup_uint64(config,
				    ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
				    l2cache == POOL_STATE_L2CACHE) {
					read_l2arc_header = B_TRUE;
				}
			}

			fletcher_4_native_varsize(buf, size, &cksum);
			rec = cksum_record_insert(&config_tree, &cksum, l);

			label->config = rec;
			label->config_nv = config;
			config_found = B_TRUE;
		} else {
			error = B_TRUE;
		}

		/*
		 * Only the fields set here are initialized in this stack
		 * vdev_t; it exists to feed the VDEV_UBERBLOCK_* macros.
		 */
		vd.vdev_ashift = ashift;
		vd.vdev_top = &vd;

		for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
			uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
			uberblock_t *ub = (void *)((char *)label + uoff);

			/* Slots that fail verification are left unrecorded. */
			if (uberblock_verify(ub))
				continue;

			fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
			rec = cksum_record_insert(&uberblock_tree, &cksum, l);

			label->uberblocks[i] = rec;
		}
	}

	/*
	 * Dump the label and uberblocks.
	 *
	 * 'ashift' here is whatever value the scan loop above last stored.
	 */
	for (int l = 0; l < VDEV_LABELS; l++) {
		zdb_label_t *label = &labels[l];
		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);

		if (label->read_failed == B_TRUE)
			continue;

		if (label->config_nv) {
			dump_config_from_label(label, buflen, l);
		} else {
			if (!dump_opt['q'])
				(void) printf("failed to unpack label %d\n", l);
		}

		if (dump_opt['u'])
			dump_label_uberblocks(label, ashift, l);

		nvlist_free(label->config_nv);
	}

	/*
	 * Dump the L2ARC header, if existent.
	 */
	if (read_l2arc_header)
		error |= dump_l2arc_header(fd);

	/* Free every record before tearing the trees down. */
	cookie = NULL;
	while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
		umem_free(node, sizeof (cksum_record_t));

	cookie = NULL;
	while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
		umem_free(node, sizeof (cksum_record_t));

	avl_destroy(&config_tree);
	avl_destroy(&uberblock_tree);

	(void) close(fd);

	return (config_found == B_FALSE ? 2 :
	    (error == B_TRUE ? 1 : 0));
}
5297
/*
 * Counters accumulated by dump_one_objset(), one slot per spa_feature_t:
 * dataset_feature_count[f] counts objsets on which feature f is active,
 * global_feature_count[f] counts other uses of feature f found while
 * walking bookmarks and livelists, and remap_deadlist_count counts
 * datasets that have a remap deadlist.
 */
static uint64_t dataset_feature_count[SPA_FEATURES];
static uint64_t global_feature_count[SPA_FEATURES];
static uint64_t remap_deadlist_count = 0;
5301
5302 static int
dump_one_objset(const char * dsname,void * arg)5303 dump_one_objset(const char *dsname, void *arg)
5304 {
5305 (void) arg;
5306 int error;
5307 objset_t *os;
5308 spa_feature_t f;
5309
5310 error = open_objset(dsname, FTAG, &os);
5311 if (error != 0)
5312 return (0);
5313
5314 for (f = 0; f < SPA_FEATURES; f++) {
5315 if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
5316 continue;
5317 ASSERT(spa_feature_table[f].fi_flags &
5318 ZFEATURE_FLAG_PER_DATASET);
5319 dataset_feature_count[f]++;
5320 }
5321
5322 if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
5323 remap_deadlist_count++;
5324 }
5325
5326 for (dsl_bookmark_node_t *dbn =
5327 avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
5328 dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
5329 mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
5330 if (dbn->dbn_phys.zbm_redaction_obj != 0) {
5331 global_feature_count[
5332 SPA_FEATURE_REDACTION_BOOKMARKS]++;
5333 objset_t *mos = os->os_spa->spa_meta_objset;
5334 dnode_t *rl;
5335 VERIFY0(dnode_hold(mos,
5336 dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
5337 if (rl->dn_have_spill) {
5338 global_feature_count[
5339 SPA_FEATURE_REDACTION_LIST_SPILL]++;
5340 }
5341 }
5342 if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
5343 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
5344 }
5345
5346 if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
5347 !dmu_objset_is_snapshot(os)) {
5348 global_feature_count[SPA_FEATURE_LIVELIST]++;
5349 }
5350
5351 dump_objset(os);
5352 close_objset(os, FTAG);
5353 fuid_table_destroy();
5354 return (0);
5355 }
5356
/*
 * Block statistics.
 *
 * One zdb_blkstats_t accumulates totals for a set of block pointers; see
 * zdb_count_block() for how each field is updated.
 */
/* Number of psize histogram buckets, including the overflow bucket. */
#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
typedef struct zdb_blkstats {
	uint64_t zb_asize;		/* sum of BP_GET_ASIZE() */
	uint64_t zb_lsize;		/* sum of BP_GET_LSIZE() */
	uint64_t zb_psize;		/* sum of BP_GET_PSIZE() */
	uint64_t zb_count;		/* number of block pointers counted */
	uint64_t zb_gangs;		/* sum of BP_COUNT_GANG() */
	/* 2- or 3-DVA blocks with at least two DVAs on the same vdev */
	uint64_t zb_ditto_samevdev;
	/* ... and with two such DVAs in the same metaslab */
	uint64_t zb_ditto_same_ms;
	/* indexed by psize >> SPA_MINBLOCKSHIFT, capped at the last bucket */
	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
} zdb_blkstats_t;
5371
/*
 * Extended object types to report deferred frees and dedup auto-ditto blocks.
 *
 * These pseudo-types are numbered immediately after the real DMU object
 * types; ZDB_OT_TOTAL is the last one and is used as the "all types"
 * column of the statistics table.
 */
#define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
#define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
#define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)

/*
 * Display names for the extended types, listed in the same order as the
 * ZDB_OT_* values above.
 */
static const char *zdb_ot_extname[] = {
	"deferred free",
	"dedup ditto",
	"other",
	"Total",
};
5386
/* Row of zcb_type[][] that accumulates statistics across all levels. */
#define	ZB_TOTAL	DN_MAX_LEVELS
/* Number of power-of-two size bins kept in the zcb_*size_* arrays. */
#define	SPA_MAX_FOR_16M	(SPA_MAXBLOCKSHIFT+1)

/*
 * Node of zdb's private in-memory BRT (zcb_brt), used by zdb_count_block()
 * to recognise further references to a cloned block.
 */
typedef struct zdb_brt_entry {
	dva_t		zbre_dva;	/* first DVA of the block (lookup key) */
	uint64_t	zbre_refcount;	/* references still expected */
	avl_node_t	zbre_node;
} zdb_brt_entry_t;
5395
/*
 * State shared by the block traversal callbacks (zdb_blkptr_cb(),
 * zdb_count_block(), zdb_blkptr_done() and the removal/indirect helpers).
 */
typedef struct zdb_cb {
	/* [level][type]; row ZB_TOTAL and column ZDB_OT_TOTAL hold totals */
	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
	/* space claimed on behalf of an in-progress vdev removal */
	uint64_t	zcb_removing_size;
	uint64_t	zcb_checkpoint_size;
	uint64_t	zcb_dedup_asize;
	uint64_t	zcb_dedup_blocks;
	/* asize and count of repeat references to cloned blocks */
	uint64_t	zcb_clone_asize;
	uint64_t	zcb_clone_blocks;
	/* power-of-two size bins, indexed by highbit64(size) - 1 */
	uint64_t	zcb_psize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_lsize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_asize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_psize_len[SPA_MAX_FOR_16M];
	uint64_t	zcb_lsize_len[SPA_MAX_FOR_16M];
	uint64_t	zcb_asize_len[SPA_MAX_FOR_16M];
	uint64_t	zcb_psize_total;
	uint64_t	zcb_lsize_total;
	uint64_t	zcb_asize_total;
	/* embedded block pointers, by embedded type (and by payload size) */
	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
	    [BPE_PAYLOAD_SIZE + 1];
	/* progress reporting: start time, last print time, expected total */
	uint64_t	zcb_start;
	hrtime_t	zcb_lastprint;
	uint64_t	zcb_totalasize;
	/* read failures seen by zdb_blkptr_done(), indexed by error number */
	uint64_t	zcb_errors[256];
	int		zcb_readfails;
	int		zcb_haderrors;
	spa_t		*zcb_spa;
	/* per-vdev obsolete count arrays, indexed by vdev id */
	uint32_t	**zcb_vd_obsolete_counts;
	/* tree of zdb_brt_entry_t; consulted only when zcb_brt_is_active */
	avl_tree_t	zcb_brt;
	boolean_t	zcb_brt_is_active;
} zdb_cb_t;
5427
5428 /* test if two DVA offsets from same vdev are within the same metaslab */
5429 static boolean_t
same_metaslab(spa_t * spa,uint64_t vdev,uint64_t off1,uint64_t off2)5430 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
5431 {
5432 vdev_t *vd = vdev_lookup_top(spa, vdev);
5433 uint64_t ms_shift = vd->vdev_ms_shift;
5434
5435 return ((off1 >> ms_shift) == (off2 >> ms_shift));
5436 }
5437
/*
 * Used to simplify reporting of the histogram data.
 *
 * Each one_histo_t describes one of the size histograms printed by
 * dump_size_histograms().
 */
typedef struct one_histo {
	const char *name;	/* column group title, e.g. "psize" */
	uint64_t *count;	/* per-bin block counts */
	uint64_t *len;		/* per-bin byte totals */
	uint64_t cumulative;	/* running sum of len[] while printing */
} one_histo_t;

/*
 * The number of separate histograms processed for psize, lsize and asize.
 */
#define	NUM_HISTO 3
5452
5453 /*
5454 * This routine will create a fixed column size output of three different
5455 * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M
5456 * the count, length and cumulative length of the psize, lsize and
5457 * asize blocks.
5458 *
5459 * All three types of blocks are listed on a single line
5460 *
5461 * By default the table is printed in nicenumber format (e.g. 123K) but
5462 * if the '-P' parameter is specified then the full raw number (parseable)
5463 * is printed out.
5464 */
5465 static void
dump_size_histograms(zdb_cb_t * zcb)5466 dump_size_histograms(zdb_cb_t *zcb)
5467 {
5468 /*
5469 * A temporary buffer that allows us to convert a number into
5470 * a string using zdb_nicenumber to allow either raw or human
5471 * readable numbers to be output.
5472 */
5473 char numbuf[32];
5474
5475 /*
5476 * Define titles which are used in the headers of the tables
5477 * printed by this routine.
5478 */
5479 const char blocksize_title1[] = "block";
5480 const char blocksize_title2[] = "size";
5481 const char count_title[] = "Count";
5482 const char length_title[] = "Size";
5483 const char cumulative_title[] = "Cum.";
5484
5485 /*
5486 * Setup the histogram arrays (psize, lsize, and asize).
5487 */
5488 one_histo_t parm_histo[NUM_HISTO];
5489
5490 parm_histo[0].name = "psize";
5491 parm_histo[0].count = zcb->zcb_psize_count;
5492 parm_histo[0].len = zcb->zcb_psize_len;
5493 parm_histo[0].cumulative = 0;
5494
5495 parm_histo[1].name = "lsize";
5496 parm_histo[1].count = zcb->zcb_lsize_count;
5497 parm_histo[1].len = zcb->zcb_lsize_len;
5498 parm_histo[1].cumulative = 0;
5499
5500 parm_histo[2].name = "asize";
5501 parm_histo[2].count = zcb->zcb_asize_count;
5502 parm_histo[2].len = zcb->zcb_asize_len;
5503 parm_histo[2].cumulative = 0;
5504
5505
5506 (void) printf("\nBlock Size Histogram\n");
5507 /*
5508 * Print the first line titles
5509 */
5510 if (dump_opt['P'])
5511 (void) printf("\n%s\t", blocksize_title1);
5512 else
5513 (void) printf("\n%7s ", blocksize_title1);
5514
5515 for (int j = 0; j < NUM_HISTO; j++) {
5516 if (dump_opt['P']) {
5517 if (j < NUM_HISTO - 1) {
5518 (void) printf("%s\t\t\t", parm_histo[j].name);
5519 } else {
5520 /* Don't print trailing spaces */
5521 (void) printf(" %s", parm_histo[j].name);
5522 }
5523 } else {
5524 if (j < NUM_HISTO - 1) {
5525 /* Left aligned strings in the output */
5526 (void) printf("%-7s ",
5527 parm_histo[j].name);
5528 } else {
5529 /* Don't print trailing spaces */
5530 (void) printf("%s", parm_histo[j].name);
5531 }
5532 }
5533 }
5534 (void) printf("\n");
5535
5536 /*
5537 * Print the second line titles
5538 */
5539 if (dump_opt['P']) {
5540 (void) printf("%s\t", blocksize_title2);
5541 } else {
5542 (void) printf("%7s ", blocksize_title2);
5543 }
5544
5545 for (int i = 0; i < NUM_HISTO; i++) {
5546 if (dump_opt['P']) {
5547 (void) printf("%s\t%s\t%s\t",
5548 count_title, length_title, cumulative_title);
5549 } else {
5550 (void) printf("%7s%7s%7s",
5551 count_title, length_title, cumulative_title);
5552 }
5553 }
5554 (void) printf("\n");
5555
5556 /*
5557 * Print the rows
5558 */
5559 for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {
5560
5561 /*
5562 * Print the first column showing the blocksize
5563 */
5564 zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));
5565
5566 if (dump_opt['P']) {
5567 printf("%s", numbuf);
5568 } else {
5569 printf("%7s:", numbuf);
5570 }
5571
5572 /*
5573 * Print the remaining set of 3 columns per size:
5574 * for psize, lsize and asize
5575 */
5576 for (int j = 0; j < NUM_HISTO; j++) {
5577 parm_histo[j].cumulative += parm_histo[j].len[i];
5578
5579 zdb_nicenum(parm_histo[j].count[i],
5580 numbuf, sizeof (numbuf));
5581 if (dump_opt['P'])
5582 (void) printf("\t%s", numbuf);
5583 else
5584 (void) printf("%7s", numbuf);
5585
5586 zdb_nicenum(parm_histo[j].len[i],
5587 numbuf, sizeof (numbuf));
5588 if (dump_opt['P'])
5589 (void) printf("\t%s", numbuf);
5590 else
5591 (void) printf("%7s", numbuf);
5592
5593 zdb_nicenum(parm_histo[j].cumulative,
5594 numbuf, sizeof (numbuf));
5595 if (dump_opt['P'])
5596 (void) printf("\t%s", numbuf);
5597 else
5598 (void) printf("%7s", numbuf);
5599 }
5600 (void) printf("\n");
5601 }
5602 }
5603
/*
 * Account for one block pointer in the traversal statistics held in 'zcb'
 * and, unless dump_opt['L'] is set, claim the block.
 *
 * When 'zilog' is non-NULL the block is first offered to zil_bp_tree_add();
 * a non-zero result makes this function return without counting anything.
 * 'type' selects the per-type statistics column and must be less than
 * ZDB_OT_TOTAL.
 */
static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
    dmu_object_type_t type)
{
	uint64_t refcnt = 0;
	int i;

	ASSERT(type < ZDB_OT_TOTAL);

	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
		return;

	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);

	/*
	 * Update four cells of the statistics table:
	 *   i == 0: [block's level][all types]
	 *   i == 1: [block's level][type]
	 *   i == 2: [all levels][all types]
	 *   i == 3: [all levels][type]
	 */
	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
		int t = (i & 1) ? type : ZDB_OT_TOTAL;
		int equal;
		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];

		zb->zb_asize += BP_GET_ASIZE(bp);
		zb->zb_lsize += BP_GET_LSIZE(bp);
		zb->zb_psize += BP_GET_PSIZE(bp);
		zb->zb_count++;

		/*
		 * The histogram is only big enough to record blocks up to
		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
		 * "other", bucket.
		 */
		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
		zb->zb_psize_histogram[idx]++;

		zb->zb_gangs += BP_COUNT_GANG(bp);

		/*
		 * For blocks with 2 or 3 DVAs, note copies that share a
		 * vdev and, among those, copies that share a metaslab.
		 */
		switch (BP_GET_NDVAS(bp)) {
		case 2:
			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) {
				zb->zb_ditto_samevdev++;

				if (same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[1])))
					zb->zb_ditto_same_ms++;
			}
			break;
		case 3:
			/* Number of DVA pairs that sit on the same vdev. */
			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) +
			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2])) +
			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2]));
			if (equal != 0) {
				zb->zb_ditto_samevdev++;

				/*
				 * Count the block at most once, for the
				 * first pair found sharing a metaslab.
				 */
				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[1])))
					zb->zb_ditto_same_ms++;
				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[2])))
					zb->zb_ditto_same_ms++;
				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[1]),
				    DVA_GET_OFFSET(&bp->blk_dva[1]),
				    DVA_GET_OFFSET(&bp->blk_dva[2])))
					zb->zb_ditto_same_ms++;
			}
			break;
		}
	}

	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);

	/* Embedded blocks are only tallied; nothing further is done. */
	if (BP_IS_EMBEDDED(bp)) {
		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
		    [BPE_GET_PSIZE(bp)]++;
		return;
	}
	/*
	 * The binning histogram bins by powers of two up to
	 * SPA_MAXBLOCKSIZE rather than creating bins for
	 * every possible blocksize found in the pool.
	 */
	int bin = highbit64(BP_GET_PSIZE(bp)) - 1;

	zcb->zcb_psize_count[bin]++;
	zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
	zcb->zcb_psize_total += BP_GET_PSIZE(bp);

	bin = highbit64(BP_GET_LSIZE(bp)) - 1;

	zcb->zcb_lsize_count[bin]++;
	zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
	zcb->zcb_lsize_total += BP_GET_LSIZE(bp);

	bin = highbit64(BP_GET_ASIZE(bp)) - 1;

	zcb->zcb_asize_count[bin]++;
	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
	zcb->zcb_asize_total += BP_GET_ASIZE(bp);

	if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
		/*
		 * Cloned blocks are special. We need to count them, so we can
		 * later uncount them when reporting leaked space, and we must
		 * only claim them once.
		 *
		 * To do this, we keep our own in-memory BRT. For each block
		 * we haven't seen before, we look it up in the real BRT and
		 * if it's there, we note it and its refcount then proceed as
		 * normal. If we see the block again, we count it as a clone
		 * and then give it no further consideration.
		 */
		zdb_brt_entry_t zbre_search, *zbre;
		avl_index_t where;

		zbre_search.zbre_dva = bp->blk_dva[0];
		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
		if (zbre != NULL) {
			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
			zcb->zcb_clone_blocks++;

			/* Drop our entry once every reference was seen. */
			zbre->zbre_refcount--;
			if (zbre->zbre_refcount == 0) {
				avl_remove(&zcb->zcb_brt, zbre);
				umem_free(zbre, sizeof (zdb_brt_entry_t));
			}
			return;
		}

		/* First sighting: remember it if the real BRT knows it. */
		uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
		if (crefcnt > 0) {
			zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
			    UMEM_NOFAIL);
			zbre->zbre_dva = bp->blk_dva[0];
			zbre->zbre_refcount = crefcnt;
			avl_insert(&zcb->zcb_brt, zbre, where);
		}
	}

	/* Everything below claims the block and is skipped under 'L'. */
	if (dump_opt['L'])
		return;

	if (BP_GET_DEDUP(bp)) {
		ddt_t *ddt;
		ddt_entry_t *dde;

		ddt = ddt_select(zcb->zcb_spa, bp);
		ddt_enter(ddt);
		dde = ddt_lookup(ddt, bp, B_FALSE);

		if (dde == NULL) {
			refcnt = 0;
		} else {
			/*
			 * Consume one reference; refcnt is what remains
			 * afterwards for this physical variant.
			 */
			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
			ddt_phys_decref(ddp);
			refcnt = ddp->ddp_refcnt;
			if (ddt_phys_total_refcnt(dde) == 0)
				ddt_remove(ddt, dde);
		}
		ddt_exit(ddt);
	}

	/*
	 * A non-zero remaining refcnt makes the claim use txg 0 instead of
	 * spa_min_claim_txg(); the claim itself must succeed.
	 */
	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
5786
5787 static void
zdb_blkptr_done(zio_t * zio)5788 zdb_blkptr_done(zio_t *zio)
5789 {
5790 spa_t *spa = zio->io_spa;
5791 blkptr_t *bp = zio->io_bp;
5792 int ioerr = zio->io_error;
5793 zdb_cb_t *zcb = zio->io_private;
5794 zbookmark_phys_t *zb = &zio->io_bookmark;
5795
5796 mutex_enter(&spa->spa_scrub_lock);
5797 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
5798 cv_broadcast(&spa->spa_scrub_io_cv);
5799
5800 if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
5801 char blkbuf[BP_SPRINTF_LEN];
5802
5803 zcb->zcb_haderrors = 1;
5804 zcb->zcb_errors[ioerr]++;
5805
5806 if (dump_opt['b'] >= 2)
5807 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
5808 else
5809 blkbuf[0] = '\0';
5810
5811 (void) printf("zdb_blkptr_cb: "
5812 "Got error %d reading "
5813 "<%llu, %llu, %lld, %llx> %s -- skipping\n",
5814 ioerr,
5815 (u_longlong_t)zb->zb_objset,
5816 (u_longlong_t)zb->zb_object,
5817 (u_longlong_t)zb->zb_level,
5818 (u_longlong_t)zb->zb_blkid,
5819 blkbuf);
5820 }
5821 mutex_exit(&spa->spa_scrub_lock);
5822
5823 abd_free(zio->io_abd);
5824 }
5825
/*
 * Traversal callback invoked for each block pointer.
 *
 * Counts the block via zdb_count_block(), optionally issues an
 * asynchronous read of it (when dump_opt['c'] is set, for metadata only
 * unless dump_opt['c'] > 1), and periodically prints a progress line to
 * stderr.  'arg' is the zdb_cb_t.  Always returns 0.
 */
static int
zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	zdb_cb_t *zcb = arg;
	dmu_object_type_t type;
	boolean_t is_metadata;

	if (zb->zb_level == ZB_DNODE_LEVEL)
		return (0);

	/* At -bbbbb, print every block pointer with a non-zero birth. */
	if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("objset %llu object %llu "
		    "level %lld offset 0x%llx %s\n",
		    (u_longlong_t)zb->zb_objset,
		    (u_longlong_t)zb->zb_object,
		    (longlong_t)zb->zb_level,
		    (u_longlong_t)blkid2offset(dnp, bp, zb),
		    blkbuf);
	}

	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
		return (0);

	type = BP_GET_TYPE(bp);

	/* Types carrying DMU_OT_NEWTYPE are all counted as "other". */
	zdb_count_block(zcb, zilog, bp,
	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);

	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));

	if (!BP_IS_EMBEDDED(bp) &&
	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
		size_t size = BP_GET_PSIZE(bp);
		abd_t *abd = abd_alloc(size, B_FALSE);
		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;

		/* If it's an intent log block, failure is expected. */
		if (zb->zb_level == ZB_ZIL_LEVEL)
			flags |= ZIO_FLAG_SPECULATIVE;

		/*
		 * Throttle: wait until the bytes already in flight drop to
		 * max_inflight_bytes or below.  zdb_blkptr_done() subtracts
		 * the size again and signals the condition variable.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_load_verify_bytes > max_inflight_bytes)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_load_verify_bytes += size;
		mutex_exit(&spa->spa_scrub_lock);

		/* 'abd' is freed by zdb_blkptr_done(). */
		zio_nowait(zio_read(NULL, spa, bp, abd, size,
		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
	}

	zcb->zcb_readfails = 0;

	/*
	 * only call gethrtime() every 100 blocks
	 * (the counter resets once it exceeds 100)
	 */
	static int iters;
	if (++iters > 100)
		iters = 0;
	else
		return (0);

	/* Print progress at most once per second, and not at -bbbbb. */
	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
		uint64_t now = gethrtime();
		char buf[10];
		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
		/* bytes per millisecond; the +1 terms avoid division by 0 */
		uint64_t kb_per_sec =
		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
		uint64_t sec_remaining =
		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;

		/* make sure nicenum has enough space */
		_Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated");

		zfs_nicebytes(bytes, buf, sizeof (buf));
		(void) fprintf(stderr,
		    "\r%5s completed (%4"PRIu64"MB/s) "
		    "estimated time remaining: "
		    "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ",
		    buf, kb_per_sec / 1024,
		    sec_remaining / 60 / 60,
		    sec_remaining / 60 % 60,
		    sec_remaining % 60);

		zcb->zcb_lastprint = now;
	}

	return (0);
}
5915
5916 static void
zdb_leak(void * arg,uint64_t start,uint64_t size)5917 zdb_leak(void *arg, uint64_t start, uint64_t size)
5918 {
5919 vdev_t *vd = arg;
5920
5921 (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
5922 (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
5923 }
5924
/*
 * Metaslab operations vector used by zdb.  Its first member, the allocator
 * entry point, is deliberately NULL.
 * NOTE(review): presumably installed so that zdb never allocates from the
 * pool -- confirm against where this table is referenced.
 */
static metaslab_ops_t zdb_metaslab_ops = {
	NULL	/* alloc */
};
5928
5929 static int
load_unflushed_svr_segs_cb(spa_t * spa,space_map_entry_t * sme,uint64_t txg,void * arg)5930 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
5931 uint64_t txg, void *arg)
5932 {
5933 spa_vdev_removal_t *svr = arg;
5934
5935 uint64_t offset = sme->sme_offset;
5936 uint64_t size = sme->sme_run;
5937
5938 /* skip vdevs we don't care about */
5939 if (sme->sme_vdev != svr->svr_vdev_id)
5940 return (0);
5941
5942 vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
5943 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5944 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
5945
5946 if (txg < metaslab_unflushed_txg(ms))
5947 return (0);
5948
5949 if (sme->sme_type == SM_ALLOC)
5950 range_tree_add(svr->svr_allocd_segs, offset, size);
5951 else
5952 range_tree_remove(svr->svr_allocd_segs, offset, size);
5953
5954 return (0);
5955 }
5956
5957 static void
claim_segment_impl_cb(uint64_t inner_offset,vdev_t * vd,uint64_t offset,uint64_t size,void * arg)5958 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5959 uint64_t size, void *arg)
5960 {
5961 (void) inner_offset, (void) arg;
5962
5963 /*
5964 * This callback was called through a remap from
5965 * a device being removed. Therefore, the vdev that
5966 * this callback is applied to is a concrete
5967 * vdev.
5968 */
5969 ASSERT(vdev_is_concrete(vd));
5970
5971 VERIFY0(metaslab_claim_impl(vd, offset, size,
5972 spa_min_claim_txg(vd->vdev_spa)));
5973 }
5974
5975 static void
claim_segment_cb(void * arg,uint64_t offset,uint64_t size)5976 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
5977 {
5978 vdev_t *vd = arg;
5979
5980 vdev_indirect_ops.vdev_op_remap(vd, offset, size,
5981 claim_segment_impl_cb, NULL);
5982 }
5983
5984 /*
5985 * After accounting for all allocated blocks that are directly referenced,
5986 * we might have missed a reference to a block from a partially complete
5987 * (and thus unused) indirect mapping object. We perform a secondary pass
5988 * through the metaslabs we have already mapped and claim the destination
5989 * blocks.
5990 */
5991 static void
zdb_claim_removing(spa_t * spa,zdb_cb_t * zcb)5992 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
5993 {
5994 if (dump_opt['L'])
5995 return;
5996
5997 if (spa->spa_vdev_removal == NULL)
5998 return;
5999
6000 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6001
6002 spa_vdev_removal_t *svr = spa->spa_vdev_removal;
6003 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
6004 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6005
6006 ASSERT0(range_tree_space(svr->svr_allocd_segs));
6007
6008 range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
6009 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
6010 metaslab_t *msp = vd->vdev_ms[msi];
6011
6012 ASSERT0(range_tree_space(allocs));
6013 if (msp->ms_sm != NULL)
6014 VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
6015 range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
6016 }
6017 range_tree_destroy(allocs);
6018
6019 iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);
6020
6021 /*
6022 * Clear everything past what has been synced,
6023 * because we have not allocated mappings for
6024 * it yet.
6025 */
6026 range_tree_clear(svr->svr_allocd_segs,
6027 vdev_indirect_mapping_max_offset(vim),
6028 vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
6029
6030 zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
6031 range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
6032
6033 spa_config_exit(spa, SCL_CONFIG, FTAG);
6034 }
6035
6036 static int
increment_indirect_mapping_cb(void * arg,const blkptr_t * bp,boolean_t bp_freed,dmu_tx_t * tx)6037 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
6038 dmu_tx_t *tx)
6039 {
6040 (void) tx;
6041 zdb_cb_t *zcb = arg;
6042 spa_t *spa = zcb->zcb_spa;
6043 vdev_t *vd;
6044 const dva_t *dva = &bp->blk_dva[0];
6045
6046 ASSERT(!bp_freed);
6047 ASSERT(!dump_opt['L']);
6048 ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
6049
6050 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
6051 vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
6052 ASSERT3P(vd, !=, NULL);
6053 spa_config_exit(spa, SCL_VDEV, FTAG);
6054
6055 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
6056 ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
6057
6058 vdev_indirect_mapping_increment_obsolete_count(
6059 vd->vdev_indirect_mapping,
6060 DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
6061 zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
6062
6063 return (0);
6064 }
6065
6066 static uint32_t *
zdb_load_obsolete_counts(vdev_t * vd)6067 zdb_load_obsolete_counts(vdev_t *vd)
6068 {
6069 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6070 spa_t *spa = vd->vdev_spa;
6071 spa_condensing_indirect_phys_t *scip =
6072 &spa->spa_condensing_indirect_phys;
6073 uint64_t obsolete_sm_object;
6074 uint32_t *counts;
6075
6076 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
6077 EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
6078 counts = vdev_indirect_mapping_load_obsolete_counts(vim);
6079 if (vd->vdev_obsolete_sm != NULL) {
6080 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
6081 vd->vdev_obsolete_sm);
6082 }
6083 if (scip->scip_vdev == vd->vdev_id &&
6084 scip->scip_prev_obsolete_sm_object != 0) {
6085 space_map_t *prev_obsolete_sm = NULL;
6086 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
6087 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
6088 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
6089 prev_obsolete_sm);
6090 space_map_close(prev_obsolete_sm);
6091 }
6092 return (counts);
6093 }
6094
6095 static void
zdb_ddt_leak_init(spa_t * spa,zdb_cb_t * zcb)6096 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
6097 {
6098 ddt_bookmark_t ddb = {0};
6099 ddt_entry_t dde;
6100 int error;
6101 int p;
6102
6103 ASSERT(!dump_opt['L']);
6104
6105 while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
6106 blkptr_t blk;
6107 ddt_phys_t *ddp = dde.dde_phys;
6108
6109 if (ddb.ddb_class == DDT_CLASS_UNIQUE)
6110 return;
6111
6112 ASSERT(ddt_phys_total_refcnt(&dde) > 1);
6113 ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
6114 VERIFY(ddt);
6115
6116 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
6117 if (ddp->ddp_phys_birth == 0)
6118 continue;
6119 ddt_bp_create(ddb.ddb_checksum,
6120 &dde.dde_key, ddp, &blk);
6121 if (p == DDT_PHYS_DITTO) {
6122 zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
6123 } else {
6124 zcb->zcb_dedup_asize +=
6125 BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
6126 zcb->zcb_dedup_blocks++;
6127 }
6128 }
6129
6130 ddt_enter(ddt);
6131 VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
6132 ddt_exit(ddt);
6133 }
6134
6135 ASSERT(error == ENOENT);
6136 }
6137
6138 typedef struct checkpoint_sm_exclude_entry_arg {
6139 vdev_t *cseea_vd;
6140 uint64_t cseea_checkpoint_size;
6141 } checkpoint_sm_exclude_entry_arg_t;
6142
6143 static int
checkpoint_sm_exclude_entry_cb(space_map_entry_t * sme,void * arg)6144 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
6145 {
6146 checkpoint_sm_exclude_entry_arg_t *cseea = arg;
6147 vdev_t *vd = cseea->cseea_vd;
6148 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
6149 uint64_t end = sme->sme_offset + sme->sme_run;
6150
6151 ASSERT(sme->sme_type == SM_FREE);
6152
6153 /*
6154 * Since the vdev_checkpoint_sm exists in the vdev level
6155 * and the ms_sm space maps exist in the metaslab level,
6156 * an entry in the checkpoint space map could theoretically
6157 * cross the boundaries of the metaslab that it belongs.
6158 *
6159 * In reality, because of the way that we populate and
6160 * manipulate the checkpoint's space maps currently,
6161 * there shouldn't be any entries that cross metaslabs.
6162 * Hence the assertion below.
6163 *
6164 * That said, there is no fundamental requirement that
6165 * the checkpoint's space map entries should not cross
6166 * metaslab boundaries. So if needed we could add code
6167 * that handles metaslab-crossing segments in the future.
6168 */
6169 VERIFY3U(sme->sme_offset, >=, ms->ms_start);
6170 VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
6171
6172 /*
6173 * By removing the entry from the allocated segments we
6174 * also verify that the entry is there to begin with.
6175 */
6176 mutex_enter(&ms->ms_lock);
6177 range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
6178 mutex_exit(&ms->ms_lock);
6179
6180 cseea->cseea_checkpoint_size += sme->sme_run;
6181 return (0);
6182 }
6183
6184 static void
zdb_leak_init_vdev_exclude_checkpoint(vdev_t * vd,zdb_cb_t * zcb)6185 zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
6186 {
6187 spa_t *spa = vd->vdev_spa;
6188 space_map_t *checkpoint_sm = NULL;
6189 uint64_t checkpoint_sm_obj;
6190
6191 /*
6192 * If there is no vdev_top_zap, we are in a pool whose
6193 * version predates the pool checkpoint feature.
6194 */
6195 if (vd->vdev_top_zap == 0)
6196 return;
6197
6198 /*
6199 * If there is no reference of the vdev_checkpoint_sm in
6200 * the vdev_top_zap, then one of the following scenarios
6201 * is true:
6202 *
6203 * 1] There is no checkpoint
6204 * 2] There is a checkpoint, but no checkpointed blocks
6205 * have been freed yet
6206 * 3] The current vdev is indirect
6207 *
6208 * In these cases we return immediately.
6209 */
6210 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
6211 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
6212 return;
6213
6214 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
6215 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
6216 &checkpoint_sm_obj));
6217
6218 checkpoint_sm_exclude_entry_arg_t cseea;
6219 cseea.cseea_vd = vd;
6220 cseea.cseea_checkpoint_size = 0;
6221
6222 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
6223 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
6224
6225 VERIFY0(space_map_iterate(checkpoint_sm,
6226 space_map_length(checkpoint_sm),
6227 checkpoint_sm_exclude_entry_cb, &cseea));
6228 space_map_close(checkpoint_sm);
6229
6230 zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
6231 }
6232
6233 static void
zdb_leak_init_exclude_checkpoint(spa_t * spa,zdb_cb_t * zcb)6234 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
6235 {
6236 ASSERT(!dump_opt['L']);
6237
6238 vdev_t *rvd = spa->spa_root_vdev;
6239 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
6240 ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
6241 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
6242 }
6243 }
6244
6245 static int
count_unflushed_space_cb(spa_t * spa,space_map_entry_t * sme,uint64_t txg,void * arg)6246 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
6247 uint64_t txg, void *arg)
6248 {
6249 int64_t *ualloc_space = arg;
6250
6251 uint64_t offset = sme->sme_offset;
6252 uint64_t vdev_id = sme->sme_vdev;
6253
6254 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
6255 if (!vdev_is_concrete(vd))
6256 return (0);
6257
6258 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6259 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
6260
6261 if (txg < metaslab_unflushed_txg(ms))
6262 return (0);
6263
6264 if (sme->sme_type == SM_ALLOC)
6265 *ualloc_space += sme->sme_run;
6266 else
6267 *ualloc_space -= sme->sme_run;
6268
6269 return (0);
6270 }
6271
6272 static int64_t
get_unflushed_alloc_space(spa_t * spa)6273 get_unflushed_alloc_space(spa_t *spa)
6274 {
6275 if (dump_opt['L'])
6276 return (0);
6277
6278 int64_t ualloc_space = 0;
6279 iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
6280 &ualloc_space);
6281 return (ualloc_space);
6282 }
6283
6284 static int
load_unflushed_cb(spa_t * spa,space_map_entry_t * sme,uint64_t txg,void * arg)6285 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
6286 {
6287 maptype_t *uic_maptype = arg;
6288
6289 uint64_t offset = sme->sme_offset;
6290 uint64_t size = sme->sme_run;
6291 uint64_t vdev_id = sme->sme_vdev;
6292
6293 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
6294
6295 /* skip indirect vdevs */
6296 if (!vdev_is_concrete(vd))
6297 return (0);
6298
6299 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6300
6301 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
6302 ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);
6303
6304 if (txg < metaslab_unflushed_txg(ms))
6305 return (0);
6306
6307 if (*uic_maptype == sme->sme_type)
6308 range_tree_add(ms->ms_allocatable, offset, size);
6309 else
6310 range_tree_remove(ms->ms_allocatable, offset, size);
6311
6312 return (0);
6313 }
6314
6315 static void
load_unflushed_to_ms_allocatables(spa_t * spa,maptype_t maptype)6316 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
6317 {
6318 iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
6319 }
6320
6321 static void
load_concrete_ms_allocatable_trees(spa_t * spa,maptype_t maptype)6322 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
6323 {
6324 vdev_t *rvd = spa->spa_root_vdev;
6325 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
6326 vdev_t *vd = rvd->vdev_child[i];
6327
6328 ASSERT3U(i, ==, vd->vdev_id);
6329
6330 if (vd->vdev_ops == &vdev_indirect_ops)
6331 continue;
6332
6333 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
6334 metaslab_t *msp = vd->vdev_ms[m];
6335
6336 (void) fprintf(stderr,
6337 "\rloading concrete vdev %llu, "
6338 "metaslab %llu of %llu ...",
6339 (longlong_t)vd->vdev_id,
6340 (longlong_t)msp->ms_id,
6341 (longlong_t)vd->vdev_ms_count);
6342
6343 mutex_enter(&msp->ms_lock);
6344 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
6345
6346 /*
6347 * We don't want to spend the CPU manipulating the
6348 * size-ordered tree, so clear the range_tree ops.
6349 */
6350 msp->ms_allocatable->rt_ops = NULL;
6351
6352 if (msp->ms_sm != NULL) {
6353 VERIFY0(space_map_load(msp->ms_sm,
6354 msp->ms_allocatable, maptype));
6355 }
6356 if (!msp->ms_loaded)
6357 msp->ms_loaded = B_TRUE;
6358 mutex_exit(&msp->ms_lock);
6359 }
6360 }
6361
6362 load_unflushed_to_ms_allocatables(spa, maptype);
6363 }
6364
6365 /*
6366 * vm_idxp is an in-out parameter which (for indirect vdevs) is the
6367 * index in vim_entries that has the first entry in this metaslab.
6368 * On return, it will be set to the first entry after this metaslab.
6369 */
6370 static void
load_indirect_ms_allocatable_tree(vdev_t * vd,metaslab_t * msp,uint64_t * vim_idxp)6371 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
6372 uint64_t *vim_idxp)
6373 {
6374 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6375
6376 mutex_enter(&msp->ms_lock);
6377 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
6378
6379 /*
6380 * We don't want to spend the CPU manipulating the
6381 * size-ordered tree, so clear the range_tree ops.
6382 */
6383 msp->ms_allocatable->rt_ops = NULL;
6384
6385 for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
6386 (*vim_idxp)++) {
6387 vdev_indirect_mapping_entry_phys_t *vimep =
6388 &vim->vim_entries[*vim_idxp];
6389 uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
6390 uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
6391 ASSERT3U(ent_offset, >=, msp->ms_start);
6392 if (ent_offset >= msp->ms_start + msp->ms_size)
6393 break;
6394
6395 /*
6396 * Mappings do not cross metaslab boundaries,
6397 * because we create them by walking the metaslabs.
6398 */
6399 ASSERT3U(ent_offset + ent_len, <=,
6400 msp->ms_start + msp->ms_size);
6401 range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
6402 }
6403
6404 if (!msp->ms_loaded)
6405 msp->ms_loaded = B_TRUE;
6406 mutex_exit(&msp->ms_lock);
6407 }
6408
6409 static void
zdb_leak_init_prepare_indirect_vdevs(spa_t * spa,zdb_cb_t * zcb)6410 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
6411 {
6412 ASSERT(!dump_opt['L']);
6413
6414 vdev_t *rvd = spa->spa_root_vdev;
6415 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
6416 vdev_t *vd = rvd->vdev_child[c];
6417
6418 ASSERT3U(c, ==, vd->vdev_id);
6419
6420 if (vd->vdev_ops != &vdev_indirect_ops)
6421 continue;
6422
6423 /*
6424 * Note: we don't check for mapping leaks on
6425 * removing vdevs because their ms_allocatable's
6426 * are used to look for leaks in allocated space.
6427 */
6428 zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
6429
6430 /*
6431 * Normally, indirect vdevs don't have any
6432 * metaslabs. We want to set them up for
6433 * zio_claim().
6434 */
6435 vdev_metaslab_group_create(vd);
6436 VERIFY0(vdev_metaslab_init(vd, 0));
6437
6438 vdev_indirect_mapping_t *vim __maybe_unused =
6439 vd->vdev_indirect_mapping;
6440 uint64_t vim_idx = 0;
6441 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
6442
6443 (void) fprintf(stderr,
6444 "\rloading indirect vdev %llu, "
6445 "metaslab %llu of %llu ...",
6446 (longlong_t)vd->vdev_id,
6447 (longlong_t)vd->vdev_ms[m]->ms_id,
6448 (longlong_t)vd->vdev_ms_count);
6449
6450 load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
6451 &vim_idx);
6452 }
6453 ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
6454 }
6455 }
6456
6457 static void
zdb_leak_init(spa_t * spa,zdb_cb_t * zcb)6458 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
6459 {
6460 zcb->zcb_spa = spa;
6461
6462 if (dump_opt['L'])
6463 return;
6464
6465 dsl_pool_t *dp = spa->spa_dsl_pool;
6466 vdev_t *rvd = spa->spa_root_vdev;
6467
6468 /*
6469 * We are going to be changing the meaning of the metaslab's
6470 * ms_allocatable. Ensure that the allocator doesn't try to
6471 * use the tree.
6472 */
6473 spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
6474 spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
6475 spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;
6476
6477 zcb->zcb_vd_obsolete_counts =
6478 umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
6479 UMEM_NOFAIL);
6480
6481 /*
6482 * For leak detection, we overload the ms_allocatable trees
6483 * to contain allocated segments instead of free segments.
6484 * As a result, we can't use the normal metaslab_load/unload
6485 * interfaces.
6486 */
6487 zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
6488 load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
6489
6490 /*
6491 * On load_concrete_ms_allocatable_trees() we loaded all the
6492 * allocated entries from the ms_sm to the ms_allocatable for
6493 * each metaslab. If the pool has a checkpoint or is in the
6494 * middle of discarding a checkpoint, some of these blocks
6495 * may have been freed but their ms_sm may not have been
6496 * updated because they are referenced by the checkpoint. In
6497 * order to avoid false-positives during leak-detection, we
6498 * go through the vdev's checkpoint space map and exclude all
6499 * its entries from their relevant ms_allocatable.
6500 *
6501 * We also aggregate the space held by the checkpoint and add
6502 * it to zcb_checkpoint_size.
6503 *
6504 * Note that at this point we are also verifying that all the
6505 * entries on the checkpoint_sm are marked as allocated in
6506 * the ms_sm of their relevant metaslab.
6507 * [see comment in checkpoint_sm_exclude_entry_cb()]
6508 */
6509 zdb_leak_init_exclude_checkpoint(spa, zcb);
6510 ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
6511
6512 /* for cleaner progress output */
6513 (void) fprintf(stderr, "\n");
6514
6515 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
6516 ASSERT(spa_feature_is_enabled(spa,
6517 SPA_FEATURE_DEVICE_REMOVAL));
6518 (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
6519 increment_indirect_mapping_cb, zcb, NULL);
6520 }
6521
6522 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6523 zdb_ddt_leak_init(spa, zcb);
6524 spa_config_exit(spa, SCL_CONFIG, FTAG);
6525 }
6526
6527 static boolean_t
zdb_check_for_obsolete_leaks(vdev_t * vd,zdb_cb_t * zcb)6528 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
6529 {
6530 boolean_t leaks = B_FALSE;
6531 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6532 uint64_t total_leaked = 0;
6533 boolean_t are_precise = B_FALSE;
6534
6535 ASSERT(vim != NULL);
6536
6537 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
6538 vdev_indirect_mapping_entry_phys_t *vimep =
6539 &vim->vim_entries[i];
6540 uint64_t obsolete_bytes = 0;
6541 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
6542 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6543
6544 /*
6545 * This is not very efficient but it's easy to
6546 * verify correctness.
6547 */
6548 for (uint64_t inner_offset = 0;
6549 inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
6550 inner_offset += 1ULL << vd->vdev_ashift) {
6551 if (range_tree_contains(msp->ms_allocatable,
6552 offset + inner_offset, 1ULL << vd->vdev_ashift)) {
6553 obsolete_bytes += 1ULL << vd->vdev_ashift;
6554 }
6555 }
6556
6557 int64_t bytes_leaked = obsolete_bytes -
6558 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
6559 ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
6560 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
6561
6562 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
6563 if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
6564 (void) printf("obsolete indirect mapping count "
6565 "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
6566 (u_longlong_t)vd->vdev_id,
6567 (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
6568 (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
6569 (u_longlong_t)bytes_leaked);
6570 }
6571 total_leaked += ABS(bytes_leaked);
6572 }
6573
6574 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
6575 if (!are_precise && total_leaked > 0) {
6576 int pct_leaked = total_leaked * 100 /
6577 vdev_indirect_mapping_bytes_mapped(vim);
6578 (void) printf("cannot verify obsolete indirect mapping "
6579 "counts of vdev %llu because precise feature was not "
6580 "enabled when it was removed: %d%% (%llx bytes) of mapping"
6581 "unreferenced\n",
6582 (u_longlong_t)vd->vdev_id, pct_leaked,
6583 (u_longlong_t)total_leaked);
6584 } else if (total_leaked > 0) {
6585 (void) printf("obsolete indirect mapping count mismatch "
6586 "for vdev %llu -- %llx total bytes mismatched\n",
6587 (u_longlong_t)vd->vdev_id,
6588 (u_longlong_t)total_leaked);
6589 leaks |= B_TRUE;
6590 }
6591
6592 vdev_indirect_mapping_free_obsolete_counts(vim,
6593 zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
6594 zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
6595
6596 return (leaks);
6597 }
6598
6599 static boolean_t
zdb_leak_fini(spa_t * spa,zdb_cb_t * zcb)6600 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
6601 {
6602 if (dump_opt['L'])
6603 return (B_FALSE);
6604
6605 boolean_t leaks = B_FALSE;
6606 vdev_t *rvd = spa->spa_root_vdev;
6607 for (unsigned c = 0; c < rvd->vdev_children; c++) {
6608 vdev_t *vd = rvd->vdev_child[c];
6609
6610 if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
6611 leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
6612 }
6613
6614 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
6615 metaslab_t *msp = vd->vdev_ms[m];
6616 ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
6617 spa_embedded_log_class(spa)) ?
6618 vd->vdev_log_mg : vd->vdev_mg);
6619
6620 /*
6621 * ms_allocatable has been overloaded
6622 * to contain allocated segments. Now that
6623 * we finished traversing all blocks, any
6624 * block that remains in the ms_allocatable
6625 * represents an allocated block that we
6626 * did not claim during the traversal.
6627 * Claimed blocks would have been removed
6628 * from the ms_allocatable. For indirect
6629 * vdevs, space remaining in the tree
6630 * represents parts of the mapping that are
6631 * not referenced, which is not a bug.
6632 */
6633 if (vd->vdev_ops == &vdev_indirect_ops) {
6634 range_tree_vacate(msp->ms_allocatable,
6635 NULL, NULL);
6636 } else {
6637 range_tree_vacate(msp->ms_allocatable,
6638 zdb_leak, vd);
6639 }
6640 if (msp->ms_loaded) {
6641 msp->ms_loaded = B_FALSE;
6642 }
6643 }
6644 }
6645
6646 umem_free(zcb->zcb_vd_obsolete_counts,
6647 rvd->vdev_children * sizeof (uint32_t *));
6648 zcb->zcb_vd_obsolete_counts = NULL;
6649
6650 return (leaks);
6651 }
6652
6653 static int
count_block_cb(void * arg,const blkptr_t * bp,dmu_tx_t * tx)6654 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6655 {
6656 (void) tx;
6657 zdb_cb_t *zcb = arg;
6658
6659 if (dump_opt['b'] >= 5) {
6660 char blkbuf[BP_SPRINTF_LEN];
6661 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
6662 (void) printf("[%s] %s\n",
6663 "deferred free", blkbuf);
6664 }
6665 zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
6666 return (0);
6667 }
6668
6669 /*
6670 * Iterate over livelists which have been destroyed by the user but
6671 * are still present in the MOS, waiting to be freed
6672 */
6673 static void
iterate_deleted_livelists(spa_t * spa,ll_iter_t func,void * arg)6674 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
6675 {
6676 objset_t *mos = spa->spa_meta_objset;
6677 uint64_t zap_obj;
6678 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
6679 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
6680 if (err == ENOENT)
6681 return;
6682 ASSERT0(err);
6683
6684 zap_cursor_t zc;
6685 zap_attribute_t attr;
6686 dsl_deadlist_t ll;
6687 /* NULL out os prior to dsl_deadlist_open in case it's garbage */
6688 ll.dl_os = NULL;
6689 for (zap_cursor_init(&zc, mos, zap_obj);
6690 zap_cursor_retrieve(&zc, &attr) == 0;
6691 (void) zap_cursor_advance(&zc)) {
6692 dsl_deadlist_open(&ll, mos, attr.za_first_integer);
6693 func(&ll, arg);
6694 dsl_deadlist_close(&ll);
6695 }
6696 zap_cursor_fini(&zc);
6697 }
6698
6699 static int
bpobj_count_block_cb(void * arg,const blkptr_t * bp,boolean_t bp_freed,dmu_tx_t * tx)6700 bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
6701 dmu_tx_t *tx)
6702 {
6703 ASSERT(!bp_freed);
6704 return (count_block_cb(arg, bp, tx));
6705 }
6706
6707 static int
livelist_entry_count_blocks_cb(void * args,dsl_deadlist_entry_t * dle)6708 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
6709 {
6710 zdb_cb_t *zbc = args;
6711 bplist_t blks;
6712 bplist_create(&blks);
6713 /* determine which blocks have been alloc'd but not freed */
6714 VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
6715 /* count those blocks */
6716 (void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
6717 bplist_destroy(&blks);
6718 return (0);
6719 }
6720
6721 static void
livelist_count_blocks(dsl_deadlist_t * ll,void * arg)6722 livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
6723 {
6724 dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
6725 }
6726
6727 /*
6728 * Count the blocks in the livelists that have been destroyed by the user
6729 * but haven't yet been freed.
6730 */
6731 static void
deleted_livelists_count_blocks(spa_t * spa,zdb_cb_t * zbc)6732 deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
6733 {
6734 iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
6735 }
6736
6737 static void
dump_livelist_cb(dsl_deadlist_t * ll,void * arg)6738 dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
6739 {
6740 ASSERT3P(arg, ==, NULL);
6741 global_feature_count[SPA_FEATURE_LIVELIST]++;
6742 dump_blkptr_list(ll, "Deleted Livelist");
6743 dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
6744 }
6745
6746 /*
6747 * Print out, register object references to, and increment feature counts for
6748 * livelists that have been destroyed by the user but haven't yet been freed.
6749 */
6750 static void
deleted_livelists_dump_mos(spa_t * spa)6751 deleted_livelists_dump_mos(spa_t *spa)
6752 {
6753 uint64_t zap_obj;
6754 objset_t *mos = spa->spa_meta_objset;
6755 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
6756 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
6757 if (err == ENOENT)
6758 return;
6759 mos_obj_refd(zap_obj);
6760 iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
6761 }
6762
6763 static int
zdb_brt_entry_compare(const void * zcn1,const void * zcn2)6764 zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
6765 {
6766 const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
6767 const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
6768 int cmp;
6769
6770 cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
6771 if (cmp == 0)
6772 cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
6773
6774 return (cmp);
6775 }
6776
6777 static int
dump_block_stats(spa_t * spa)6778 dump_block_stats(spa_t *spa)
6779 {
6780 zdb_cb_t *zcb;
6781 zdb_blkstats_t *zb, *tzb;
6782 uint64_t norm_alloc, norm_space, total_alloc, total_found;
6783 int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
6784 TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
6785 boolean_t leaks = B_FALSE;
6786 int e, c, err;
6787 bp_embedded_type_t i;
6788
6789 zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
6790
6791 if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
6792 avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
6793 sizeof (zdb_brt_entry_t),
6794 offsetof(zdb_brt_entry_t, zbre_node));
6795 zcb->zcb_brt_is_active = B_TRUE;
6796 }
6797
6798 (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
6799 (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
6800 (dump_opt['c'] == 1) ? "metadata " : "",
6801 dump_opt['c'] ? "checksums " : "",
6802 (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
6803 !dump_opt['L'] ? "nothing leaked " : "");
6804
6805 /*
6806 * When leak detection is enabled we load all space maps as SM_ALLOC
6807 * maps, then traverse the pool claiming each block we discover. If
6808 * the pool is perfectly consistent, the segment trees will be empty
6809 * when we're done. Anything left over is a leak; any block we can't
6810 * claim (because it's not part of any space map) is a double
6811 * allocation, reference to a freed block, or an unclaimed log block.
6812 *
6813 * When leak detection is disabled (-L option) we still traverse the
6814 * pool claiming each block we discover, but we skip opening any space
6815 * maps.
6816 */
6817 zdb_leak_init(spa, zcb);
6818
6819 /*
6820 * If there's a deferred-free bplist, process that first.
6821 */
6822 (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
6823 bpobj_count_block_cb, zcb, NULL);
6824
6825 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
6826 (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
6827 bpobj_count_block_cb, zcb, NULL);
6828 }
6829
6830 zdb_claim_removing(spa, zcb);
6831
6832 if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
6833 VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
6834 spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
6835 zcb, NULL));
6836 }
6837
6838 deleted_livelists_count_blocks(spa, zcb);
6839
6840 if (dump_opt['c'] > 1)
6841 flags |= TRAVERSE_PREFETCH_DATA;
6842
6843 zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
6844 zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
6845 zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
6846 zcb->zcb_totalasize +=
6847 metaslab_class_get_alloc(spa_embedded_log_class(spa));
6848 zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
6849 err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);
6850
6851 /*
6852 * If we've traversed the data blocks then we need to wait for those
6853 * I/Os to complete. We leverage "The Godfather" zio to wait on
6854 * all async I/Os to complete.
6855 */
6856 if (dump_opt['c']) {
6857 for (c = 0; c < max_ncpus; c++) {
6858 (void) zio_wait(spa->spa_async_zio_root[c]);
6859 spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
6860 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
6861 ZIO_FLAG_GODFATHER);
6862 }
6863 }
6864 ASSERT0(spa->spa_load_verify_bytes);
6865
6866 /*
6867 * Done after zio_wait() since zcb_haderrors is modified in
6868 * zdb_blkptr_done()
6869 */
6870 zcb->zcb_haderrors |= err;
6871
6872 if (zcb->zcb_haderrors) {
6873 (void) printf("\nError counts:\n\n");
6874 (void) printf("\t%5s %s\n", "errno", "count");
6875 for (e = 0; e < 256; e++) {
6876 if (zcb->zcb_errors[e] != 0) {
6877 (void) printf("\t%5d %llu\n",
6878 e, (u_longlong_t)zcb->zcb_errors[e]);
6879 }
6880 }
6881 }
6882
6883 /*
6884 * Report any leaked segments.
6885 */
6886 leaks |= zdb_leak_fini(spa, zcb);
6887
6888 tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
6889
6890 norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
6891 norm_space = metaslab_class_get_space(spa_normal_class(spa));
6892
6893 total_alloc = norm_alloc +
6894 metaslab_class_get_alloc(spa_log_class(spa)) +
6895 metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
6896 metaslab_class_get_alloc(spa_special_class(spa)) +
6897 metaslab_class_get_alloc(spa_dedup_class(spa)) +
6898 get_unflushed_alloc_space(spa);
6899 total_found =
6900 tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
6901 zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
6902
6903 if (total_found == total_alloc && !dump_opt['L']) {
6904 (void) printf("\n\tNo leaks (block sum matches space"
6905 " maps exactly)\n");
6906 } else if (!dump_opt['L']) {
6907 (void) printf("block traversal size %llu != alloc %llu "
6908 "(%s %lld)\n",
6909 (u_longlong_t)total_found,
6910 (u_longlong_t)total_alloc,
6911 (dump_opt['L']) ? "unreachable" : "leaked",
6912 (longlong_t)(total_alloc - total_found));
6913 leaks = B_TRUE;
6914 }
6915
6916 if (tzb->zb_count == 0) {
6917 umem_free(zcb, sizeof (zdb_cb_t));
6918 return (2);
6919 }
6920
6921 (void) printf("\n");
6922 (void) printf("\t%-16s %14llu\n", "bp count:",
6923 (u_longlong_t)tzb->zb_count);
6924 (void) printf("\t%-16s %14llu\n", "ganged count:",
6925 (longlong_t)tzb->zb_gangs);
6926 (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:",
6927 (u_longlong_t)tzb->zb_lsize,
6928 (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
6929 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
6930 "bp physical:", (u_longlong_t)tzb->zb_psize,
6931 (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
6932 (double)tzb->zb_lsize / tzb->zb_psize);
6933 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
6934 "bp allocated:", (u_longlong_t)tzb->zb_asize,
6935 (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
6936 (double)tzb->zb_lsize / tzb->zb_asize);
6937 (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n",
6938 "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
6939 (u_longlong_t)zcb->zcb_dedup_blocks,
6940 (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
6941 (void) printf("\t%-16s %14llu count: %6llu\n",
6942 "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
6943 (u_longlong_t)zcb->zcb_clone_blocks);
6944 (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
6945 (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
6946
6947 if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {
6948 uint64_t alloc = metaslab_class_get_alloc(
6949 spa_special_class(spa));
6950 uint64_t space = metaslab_class_get_space(
6951 spa_special_class(spa));
6952
6953 (void) printf("\t%-16s %14llu used: %5.2f%%\n",
6954 "Special class", (u_longlong_t)alloc,
6955 100.0 * alloc / space);
6956 }
6957
6958 if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {
6959 uint64_t alloc = metaslab_class_get_alloc(
6960 spa_dedup_class(spa));
6961 uint64_t space = metaslab_class_get_space(
6962 spa_dedup_class(spa));
6963
6964 (void) printf("\t%-16s %14llu used: %5.2f%%\n",
6965 "Dedup class", (u_longlong_t)alloc,
6966 100.0 * alloc / space);
6967 }
6968
6969 if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
6970 uint64_t alloc = metaslab_class_get_alloc(
6971 spa_embedded_log_class(spa));
6972 uint64_t space = metaslab_class_get_space(
6973 spa_embedded_log_class(spa));
6974
6975 (void) printf("\t%-16s %14llu used: %5.2f%%\n",
6976 "Embedded log class", (u_longlong_t)alloc,
6977 100.0 * alloc / space);
6978 }
6979
6980 for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
6981 if (zcb->zcb_embedded_blocks[i] == 0)
6982 continue;
6983 (void) printf("\n");
6984 (void) printf("\tadditional, non-pointer bps of type %u: "
6985 "%10llu\n",
6986 i, (u_longlong_t)zcb->zcb_embedded_blocks[i]);
6987
6988 if (dump_opt['b'] >= 3) {
6989 (void) printf("\t number of (compressed) bytes: "
6990 "number of bps\n");
6991 dump_histogram(zcb->zcb_embedded_histogram[i],
6992 sizeof (zcb->zcb_embedded_histogram[i]) /
6993 sizeof (zcb->zcb_embedded_histogram[i][0]), 0);
6994 }
6995 }
6996
6997 if (tzb->zb_ditto_samevdev != 0) {
6998 (void) printf("\tDittoed blocks on same vdev: %llu\n",
6999 (longlong_t)tzb->zb_ditto_samevdev);
7000 }
7001 if (tzb->zb_ditto_same_ms != 0) {
7002 (void) printf("\tDittoed blocks in same metaslab: %llu\n",
7003 (longlong_t)tzb->zb_ditto_same_ms);
7004 }
7005
7006 for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
7007 vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
7008 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
7009
7010 if (vim == NULL) {
7011 continue;
7012 }
7013
7014 char mem[32];
7015 zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
7016 mem, vdev_indirect_mapping_size(vim));
7017
7018 (void) printf("\tindirect vdev id %llu has %llu segments "
7019 "(%s in memory)\n",
7020 (longlong_t)vd->vdev_id,
7021 (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
7022 }
7023
7024 if (dump_opt['b'] >= 2) {
7025 int l, t, level;
7026 char csize[32], lsize[32], psize[32], asize[32];
7027 char avg[32], gang[32];
7028 (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
7029 "\t avg\t comp\t%%Total\tType\n");
7030
7031 zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t),
7032 UMEM_NOFAIL);
7033
7034 for (t = 0; t <= ZDB_OT_TOTAL; t++) {
7035 const char *typename;
7036
7037 /* make sure nicenum has enough space */
7038 _Static_assert(sizeof (csize) >= NN_NUMBUF_SZ,
7039 "csize truncated");
7040 _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ,
7041 "lsize truncated");
7042 _Static_assert(sizeof (psize) >= NN_NUMBUF_SZ,
7043 "psize truncated");
7044 _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ,
7045 "asize truncated");
7046 _Static_assert(sizeof (avg) >= NN_NUMBUF_SZ,
7047 "avg truncated");
7048 _Static_assert(sizeof (gang) >= NN_NUMBUF_SZ,
7049 "gang truncated");
7050
7051 if (t < DMU_OT_NUMTYPES)
7052 typename = dmu_ot[t].ot_name;
7053 else
7054 typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
7055
7056 if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) {
7057 (void) printf("%6s\t%5s\t%5s\t%5s"
7058 "\t%5s\t%5s\t%6s\t%s\n",
7059 "-",
7060 "-",
7061 "-",
7062 "-",
7063 "-",
7064 "-",
7065 "-",
7066 typename);
7067 continue;
7068 }
7069
7070 for (l = ZB_TOTAL - 1; l >= -1; l--) {
7071 level = (l == -1 ? ZB_TOTAL : l);
7072 zb = &zcb->zcb_type[level][t];
7073
7074 if (zb->zb_asize == 0)
7075 continue;
7076
7077 if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES &&
7078 (level > 0 || DMU_OT_IS_METADATA(t))) {
7079 mdstats->zb_count += zb->zb_count;
7080 mdstats->zb_lsize += zb->zb_lsize;
7081 mdstats->zb_psize += zb->zb_psize;
7082 mdstats->zb_asize += zb->zb_asize;
7083 mdstats->zb_gangs += zb->zb_gangs;
7084 }
7085
7086 if (dump_opt['b'] < 3 && level != ZB_TOTAL)
7087 continue;
7088
7089 if (level == 0 && zb->zb_asize ==
7090 zcb->zcb_type[ZB_TOTAL][t].zb_asize)
7091 continue;
7092
7093 zdb_nicenum(zb->zb_count, csize,
7094 sizeof (csize));
7095 zdb_nicenum(zb->zb_lsize, lsize,
7096 sizeof (lsize));
7097 zdb_nicenum(zb->zb_psize, psize,
7098 sizeof (psize));
7099 zdb_nicenum(zb->zb_asize, asize,
7100 sizeof (asize));
7101 zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
7102 sizeof (avg));
7103 zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
7104
7105 (void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
7106 "\t%5.2f\t%6.2f\t",
7107 csize, lsize, psize, asize, avg,
7108 (double)zb->zb_lsize / zb->zb_psize,
7109 100.0 * zb->zb_asize / tzb->zb_asize);
7110
7111 if (level == ZB_TOTAL)
7112 (void) printf("%s\n", typename);
7113 else
7114 (void) printf(" L%d %s\n",
7115 level, typename);
7116
7117 if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
7118 (void) printf("\t number of ganged "
7119 "blocks: %s\n", gang);
7120 }
7121
7122 if (dump_opt['b'] >= 4) {
7123 (void) printf("psize "
7124 "(in 512-byte sectors): "
7125 "number of blocks\n");
7126 dump_histogram(zb->zb_psize_histogram,
7127 PSIZE_HISTO_SIZE, 0);
7128 }
7129 }
7130 }
7131 zdb_nicenum(mdstats->zb_count, csize,
7132 sizeof (csize));
7133 zdb_nicenum(mdstats->zb_lsize, lsize,
7134 sizeof (lsize));
7135 zdb_nicenum(mdstats->zb_psize, psize,
7136 sizeof (psize));
7137 zdb_nicenum(mdstats->zb_asize, asize,
7138 sizeof (asize));
7139 zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg,
7140 sizeof (avg));
7141 zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang));
7142
7143 (void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
7144 "\t%5.2f\t%6.2f\t",
7145 csize, lsize, psize, asize, avg,
7146 (double)mdstats->zb_lsize / mdstats->zb_psize,
7147 100.0 * mdstats->zb_asize / tzb->zb_asize);
7148 (void) printf("%s\n", "Metadata Total");
7149
7150 /* Output a table summarizing block sizes in the pool */
7151 if (dump_opt['b'] >= 2) {
7152 dump_size_histograms(zcb);
7153 }
7154
7155 umem_free(mdstats, sizeof (zfs_blkstat_t));
7156 }
7157
7158 (void) printf("\n");
7159
7160 if (leaks) {
7161 umem_free(zcb, sizeof (zdb_cb_t));
7162 return (2);
7163 }
7164
7165 if (zcb->zcb_haderrors) {
7166 umem_free(zcb, sizeof (zdb_cb_t));
7167 return (3);
7168 }
7169
7170 umem_free(zcb, sizeof (zdb_cb_t));
7171 return (0);
7172 }
7173
/*
 * One node of the AVL tree that zdb_ddt_add_cb() fills in while the
 * simulated dedup table is being built.  The zdde_ref_* members are
 * running totals accumulated over every block pointer that mapped to
 * the same key.
 */
typedef struct zdb_ddt_entry {
	/* key must be first for ddt_key_compare */
	ddt_key_t	zdde_key;
	/* incremented once per matching block pointer */
	uint64_t	zdde_ref_blocks;
	/* running sum of BP_GET_LSIZE() over matching block pointers */
	uint64_t	zdde_ref_lsize;
	/* running sum of BP_GET_PSIZE() over matching block pointers */
	uint64_t	zdde_ref_psize;
	/* running sum of bp_get_dsize_sync() over matching block pointers */
	uint64_t	zdde_ref_dsize;
	/* linkage used by the AVL tree created in dump_simulated_ddt() */
	avl_node_t	zdde_node;
} zdb_ddt_entry_t;
7183
7184 static int
zdb_ddt_add_cb(spa_t * spa,zilog_t * zilog,const blkptr_t * bp,const zbookmark_phys_t * zb,const dnode_phys_t * dnp,void * arg)7185 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
7186 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
7187 {
7188 (void) zilog, (void) dnp;
7189 avl_tree_t *t = arg;
7190 avl_index_t where;
7191 zdb_ddt_entry_t *zdde, zdde_search;
7192
7193 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
7194 BP_IS_EMBEDDED(bp))
7195 return (0);
7196
7197 if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
7198 (void) printf("traversing objset %llu, %llu objects, "
7199 "%lu blocks so far\n",
7200 (u_longlong_t)zb->zb_objset,
7201 (u_longlong_t)BP_GET_FILL(bp),
7202 avl_numnodes(t));
7203 }
7204
7205 if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
7206 BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
7207 return (0);
7208
7209 ddt_key_fill(&zdde_search.zdde_key, bp);
7210
7211 zdde = avl_find(t, &zdde_search, &where);
7212
7213 if (zdde == NULL) {
7214 zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
7215 zdde->zdde_key = zdde_search.zdde_key;
7216 avl_insert(t, zdde, where);
7217 }
7218
7219 zdde->zdde_ref_blocks += 1;
7220 zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
7221 zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
7222 zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
7223
7224 return (0);
7225 }
7226
7227 static void
dump_simulated_ddt(spa_t * spa)7228 dump_simulated_ddt(spa_t *spa)
7229 {
7230 avl_tree_t t;
7231 void *cookie = NULL;
7232 zdb_ddt_entry_t *zdde;
7233 ddt_histogram_t ddh_total = {{{0}}};
7234 ddt_stat_t dds_total = {0};
7235
7236 avl_create(&t, ddt_key_compare,
7237 sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
7238
7239 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
7240
7241 (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
7242 TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);
7243
7244 spa_config_exit(spa, SCL_CONFIG, FTAG);
7245
7246 while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
7247 ddt_stat_t dds;
7248 uint64_t refcnt = zdde->zdde_ref_blocks;
7249 ASSERT(refcnt != 0);
7250
7251 dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
7252 dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
7253 dds.dds_psize = zdde->zdde_ref_psize / refcnt;
7254 dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
7255
7256 dds.dds_ref_blocks = zdde->zdde_ref_blocks;
7257 dds.dds_ref_lsize = zdde->zdde_ref_lsize;
7258 dds.dds_ref_psize = zdde->zdde_ref_psize;
7259 dds.dds_ref_dsize = zdde->zdde_ref_dsize;
7260
7261 ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
7262 &dds, 0);
7263
7264 umem_free(zdde, sizeof (*zdde));
7265 }
7266
7267 avl_destroy(&t);
7268
7269 ddt_histogram_stat(&dds_total, &ddh_total);
7270
7271 (void) printf("Simulated DDT histogram:\n");
7272
7273 zpool_dump_ddt(&dds_total, &ddh_total);
7274
7275 dump_dedup_ratio(&dds_total);
7276 }
7277
7278 static int
verify_device_removal_feature_counts(spa_t * spa)7279 verify_device_removal_feature_counts(spa_t *spa)
7280 {
7281 uint64_t dr_feature_refcount = 0;
7282 uint64_t oc_feature_refcount = 0;
7283 uint64_t indirect_vdev_count = 0;
7284 uint64_t precise_vdev_count = 0;
7285 uint64_t obsolete_counts_object_count = 0;
7286 uint64_t obsolete_sm_count = 0;
7287 uint64_t obsolete_counts_count = 0;
7288 uint64_t scip_count = 0;
7289 uint64_t obsolete_bpobj_count = 0;
7290 int ret = 0;
7291
7292 spa_condensing_indirect_phys_t *scip =
7293 &spa->spa_condensing_indirect_phys;
7294 if (scip->scip_next_mapping_object != 0) {
7295 vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
7296 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
7297 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
7298
7299 (void) printf("Condensing indirect vdev %llu: new mapping "
7300 "object %llu, prev obsolete sm %llu\n",
7301 (u_longlong_t)scip->scip_vdev,
7302 (u_longlong_t)scip->scip_next_mapping_object,
7303 (u_longlong_t)scip->scip_prev_obsolete_sm_object);
7304 if (scip->scip_prev_obsolete_sm_object != 0) {
7305 space_map_t *prev_obsolete_sm = NULL;
7306 VERIFY0(space_map_open(&prev_obsolete_sm,
7307 spa->spa_meta_objset,
7308 scip->scip_prev_obsolete_sm_object,
7309 0, vd->vdev_asize, 0));
7310 dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
7311 (void) printf("\n");
7312 space_map_close(prev_obsolete_sm);
7313 }
7314
7315 scip_count += 2;
7316 }
7317
7318 for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
7319 vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
7320 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
7321
7322 if (vic->vic_mapping_object != 0) {
7323 ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
7324 vd->vdev_removing);
7325 indirect_vdev_count++;
7326
7327 if (vd->vdev_indirect_mapping->vim_havecounts) {
7328 obsolete_counts_count++;
7329 }
7330 }
7331
7332 boolean_t are_precise;
7333 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
7334 if (are_precise) {
7335 ASSERT(vic->vic_mapping_object != 0);
7336 precise_vdev_count++;
7337 }
7338
7339 uint64_t obsolete_sm_object;
7340 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
7341 if (obsolete_sm_object != 0) {
7342 ASSERT(vic->vic_mapping_object != 0);
7343 obsolete_sm_count++;
7344 }
7345 }
7346
7347 (void) feature_get_refcount(spa,
7348 &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
7349 &dr_feature_refcount);
7350 (void) feature_get_refcount(spa,
7351 &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
7352 &oc_feature_refcount);
7353
7354 if (dr_feature_refcount != indirect_vdev_count) {
7355 ret = 1;
7356 (void) printf("Number of indirect vdevs (%llu) " \
7357 "does not match feature count (%llu)\n",
7358 (u_longlong_t)indirect_vdev_count,
7359 (u_longlong_t)dr_feature_refcount);
7360 } else {
7361 (void) printf("Verified device_removal feature refcount " \
7362 "of %llu is correct\n",
7363 (u_longlong_t)dr_feature_refcount);
7364 }
7365
7366 if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
7367 DMU_POOL_OBSOLETE_BPOBJ) == 0) {
7368 obsolete_bpobj_count++;
7369 }
7370
7371
7372 obsolete_counts_object_count = precise_vdev_count;
7373 obsolete_counts_object_count += obsolete_sm_count;
7374 obsolete_counts_object_count += obsolete_counts_count;
7375 obsolete_counts_object_count += scip_count;
7376 obsolete_counts_object_count += obsolete_bpobj_count;
7377 obsolete_counts_object_count += remap_deadlist_count;
7378
7379 if (oc_feature_refcount != obsolete_counts_object_count) {
7380 ret = 1;
7381 (void) printf("Number of obsolete counts objects (%llu) " \
7382 "does not match feature count (%llu)\n",
7383 (u_longlong_t)obsolete_counts_object_count,
7384 (u_longlong_t)oc_feature_refcount);
7385 (void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
7386 "ob:%llu rd:%llu\n",
7387 (u_longlong_t)precise_vdev_count,
7388 (u_longlong_t)obsolete_sm_count,
7389 (u_longlong_t)obsolete_counts_count,
7390 (u_longlong_t)scip_count,
7391 (u_longlong_t)obsolete_bpobj_count,
7392 (u_longlong_t)remap_deadlist_count);
7393 } else {
7394 (void) printf("Verified indirect_refcount feature refcount " \
7395 "of %llu is correct\n",
7396 (u_longlong_t)oc_feature_refcount);
7397 }
7398 return (ret);
7399 }
7400
7401 static void
zdb_set_skip_mmp(char * target)7402 zdb_set_skip_mmp(char *target)
7403 {
7404 spa_t *spa;
7405
7406 /*
7407 * Disable the activity check to allow examination of
7408 * active pools.
7409 */
7410 mutex_enter(&spa_namespace_lock);
7411 if ((spa = spa_lookup(target)) != NULL) {
7412 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
7413 }
7414 mutex_exit(&spa_namespace_lock);
7415 }
7416
7417 #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
7418 /*
7419 * Import the checkpointed state of the pool specified by the target
7420 * parameter as readonly. The function also accepts a pool config
7421 * as an optional parameter, else it attempts to infer the config by
7422 * the name of the target pool.
7423 *
7424 * Note that the checkpointed state's pool name will be the name of
7425 * the original pool with the above suffix appended to it. In addition,
7426 * if the target is not a pool name (e.g. a path to a dataset) then
7427 * the new_path parameter is populated with the updated path to
7428 * reflect the fact that we are looking into the checkpointed state.
7429 *
7430 * The function returns a newly-allocated copy of the name of the
7431 * pool containing the checkpointed state. When this copy is no
7432 * longer needed it should be freed with free(3C). Same thing
7433 * applies to the new_path parameter if allocated.
7434 */
7435 static char *
import_checkpointed_state(char * target,nvlist_t * cfg,char ** new_path)7436 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
7437 {
7438 int error = 0;
7439 char *poolname, *bogus_name = NULL;
7440 boolean_t freecfg = B_FALSE;
7441
7442 /* If the target is not a pool, the extract the pool name */
7443 char *path_start = strchr(target, '/');
7444 if (path_start != NULL) {
7445 size_t poolname_len = path_start - target;
7446 poolname = strndup(target, poolname_len);
7447 } else {
7448 poolname = target;
7449 }
7450
7451 if (cfg == NULL) {
7452 zdb_set_skip_mmp(poolname);
7453 error = spa_get_stats(poolname, &cfg, NULL, 0);
7454 if (error != 0) {
7455 fatal("Tried to read config of pool \"%s\" but "
7456 "spa_get_stats() failed with error %d\n",
7457 poolname, error);
7458 }
7459 freecfg = B_TRUE;
7460 }
7461
7462 if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
7463 if (target != poolname)
7464 free(poolname);
7465 return (NULL);
7466 }
7467 fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
7468
7469 error = spa_import(bogus_name, cfg, NULL,
7470 ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
7471 ZFS_IMPORT_SKIP_MMP);
7472 if (freecfg)
7473 nvlist_free(cfg);
7474 if (error != 0) {
7475 fatal("Tried to import pool \"%s\" but spa_import() failed "
7476 "with error %d\n", bogus_name, error);
7477 }
7478
7479 if (new_path != NULL && path_start != NULL) {
7480 if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
7481 free(bogus_name);
7482 if (path_start != NULL)
7483 free(poolname);
7484 return (NULL);
7485 }
7486 }
7487
7488 if (target != poolname)
7489 free(poolname);
7490
7491 return (bogus_name);
7492 }
7493
/*
 * State handed to verify_checkpoint_sm_entry_cb() for each space map
 * that verify_checkpoint_vdev_spacemaps() iterates.
 */
typedef struct verify_checkpoint_sm_entry_cb_arg {
	/* vdev whose metaslabs the space map entries are checked against */
	vdev_t *vcsec_vd;

	/* the following fields are only used for printing progress */
	uint64_t vcsec_entryid;		/* incremented once per entry visited */
	uint64_t vcsec_num_entries;	/* space map length / sizeof (uint64_t) */
} verify_checkpoint_sm_entry_cb_arg_t;
7501
7502 #define ENTRIES_PER_PROGRESS_UPDATE 10000
7503
7504 static int
verify_checkpoint_sm_entry_cb(space_map_entry_t * sme,void * arg)7505 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
7506 {
7507 verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
7508 vdev_t *vd = vcsec->vcsec_vd;
7509 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
7510 uint64_t end = sme->sme_offset + sme->sme_run;
7511
7512 ASSERT(sme->sme_type == SM_FREE);
7513
7514 if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
7515 (void) fprintf(stderr,
7516 "\rverifying vdev %llu, space map entry %llu of %llu ...",
7517 (longlong_t)vd->vdev_id,
7518 (longlong_t)vcsec->vcsec_entryid,
7519 (longlong_t)vcsec->vcsec_num_entries);
7520 }
7521 vcsec->vcsec_entryid++;
7522
7523 /*
7524 * See comment in checkpoint_sm_exclude_entry_cb()
7525 */
7526 VERIFY3U(sme->sme_offset, >=, ms->ms_start);
7527 VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
7528
7529 /*
7530 * The entries in the vdev_checkpoint_sm should be marked as
7531 * allocated in the checkpointed state of the pool, therefore
7532 * their respective ms_allocateable trees should not contain them.
7533 */
7534 mutex_enter(&ms->ms_lock);
7535 range_tree_verify_not_present(ms->ms_allocatable,
7536 sme->sme_offset, sme->sme_run);
7537 mutex_exit(&ms->ms_lock);
7538
7539 return (0);
7540 }
7541
7542 /*
7543 * Verify that all segments in the vdev_checkpoint_sm are allocated
7544 * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
7545 * ms_allocatable).
7546 *
7547 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
7548 * each vdev in the current state of the pool to the metaslab space maps
7549 * (ms_sm) of the checkpointed state of the pool.
7550 *
7551 * Note that the function changes the state of the ms_allocatable
7552 * trees of the current spa_t. The entries of these ms_allocatable
7553 * trees are cleared out and then repopulated from with the free
7554 * entries of their respective ms_sm space maps.
7555 */
7556 static void
verify_checkpoint_vdev_spacemaps(spa_t * checkpoint,spa_t * current)7557 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
7558 {
7559 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
7560 vdev_t *current_rvd = current->spa_root_vdev;
7561
7562 load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
7563
7564 for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
7565 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
7566 vdev_t *current_vd = current_rvd->vdev_child[c];
7567
7568 space_map_t *checkpoint_sm = NULL;
7569 uint64_t checkpoint_sm_obj;
7570
7571 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
7572 /*
7573 * Since we don't allow device removal in a pool
7574 * that has a checkpoint, we expect that all removed
7575 * vdevs were removed from the pool before the
7576 * checkpoint.
7577 */
7578 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
7579 continue;
7580 }
7581
7582 /*
7583 * If the checkpoint space map doesn't exist, then nothing
7584 * here is checkpointed so there's nothing to verify.
7585 */
7586 if (current_vd->vdev_top_zap == 0 ||
7587 zap_contains(spa_meta_objset(current),
7588 current_vd->vdev_top_zap,
7589 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
7590 continue;
7591
7592 VERIFY0(zap_lookup(spa_meta_objset(current),
7593 current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
7594 sizeof (uint64_t), 1, &checkpoint_sm_obj));
7595
7596 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
7597 checkpoint_sm_obj, 0, current_vd->vdev_asize,
7598 current_vd->vdev_ashift));
7599
7600 verify_checkpoint_sm_entry_cb_arg_t vcsec;
7601 vcsec.vcsec_vd = ckpoint_vd;
7602 vcsec.vcsec_entryid = 0;
7603 vcsec.vcsec_num_entries =
7604 space_map_length(checkpoint_sm) / sizeof (uint64_t);
7605 VERIFY0(space_map_iterate(checkpoint_sm,
7606 space_map_length(checkpoint_sm),
7607 verify_checkpoint_sm_entry_cb, &vcsec));
7608 if (dump_opt['m'] > 3)
7609 dump_spacemap(current->spa_meta_objset, checkpoint_sm);
7610 space_map_close(checkpoint_sm);
7611 }
7612
7613 /*
7614 * If we've added vdevs since we took the checkpoint, ensure
7615 * that their checkpoint space maps are empty.
7616 */
7617 if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
7618 for (uint64_t c = ckpoint_rvd->vdev_children;
7619 c < current_rvd->vdev_children; c++) {
7620 vdev_t *current_vd = current_rvd->vdev_child[c];
7621 VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL);
7622 }
7623 }
7624
7625 /* for cleaner progress output */
7626 (void) fprintf(stderr, "\n");
7627 }
7628
7629 /*
7630 * Verifies that all space that's allocated in the checkpoint is
7631 * still allocated in the current version, by checking that everything
7632 * in checkpoint's ms_allocatable (which is actually allocated, not
7633 * allocatable/free) is not present in current's ms_allocatable.
7634 *
7635 * Note that the function changes the state of the ms_allocatable
7636 * trees of both spas when called. The entries of all ms_allocatable
7637 * trees are cleared out and then repopulated from their respective
7638 * ms_sm space maps. In the checkpointed state we load the allocated
7639 * entries, and in the current state we load the free entries.
7640 */
7641 static void
verify_checkpoint_ms_spacemaps(spa_t * checkpoint,spa_t * current)7642 verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
7643 {
7644 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
7645 vdev_t *current_rvd = current->spa_root_vdev;
7646
7647 load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
7648 load_concrete_ms_allocatable_trees(current, SM_FREE);
7649
7650 for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
7651 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
7652 vdev_t *current_vd = current_rvd->vdev_child[i];
7653
7654 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
7655 /*
7656 * See comment in verify_checkpoint_vdev_spacemaps()
7657 */
7658 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
7659 continue;
7660 }
7661
7662 for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
7663 metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
7664 metaslab_t *current_msp = current_vd->vdev_ms[m];
7665
7666 (void) fprintf(stderr,
7667 "\rverifying vdev %llu of %llu, "
7668 "metaslab %llu of %llu ...",
7669 (longlong_t)current_vd->vdev_id,
7670 (longlong_t)current_rvd->vdev_children,
7671 (longlong_t)current_vd->vdev_ms[m]->ms_id,
7672 (longlong_t)current_vd->vdev_ms_count);
7673
7674 /*
7675 * We walk through the ms_allocatable trees that
7676 * are loaded with the allocated blocks from the
7677 * ms_sm spacemaps of the checkpoint. For each
7678 * one of these ranges we ensure that none of them
7679 * exists in the ms_allocatable trees of the
7680 * current state which are loaded with the ranges
7681 * that are currently free.
7682 *
7683 * This way we ensure that none of the blocks that
7684 * are part of the checkpoint were freed by mistake.
7685 */
7686 range_tree_walk(ckpoint_msp->ms_allocatable,
7687 (range_tree_func_t *)range_tree_verify_not_present,
7688 current_msp->ms_allocatable);
7689 }
7690 }
7691
7692 /* for cleaner progress output */
7693 (void) fprintf(stderr, "\n");
7694 }
7695
7696 static void
verify_checkpoint_blocks(spa_t * spa)7697 verify_checkpoint_blocks(spa_t *spa)
7698 {
7699 ASSERT(!dump_opt['L']);
7700
7701 spa_t *checkpoint_spa;
7702 char *checkpoint_pool;
7703 int error = 0;
7704
7705 /*
7706 * We import the checkpointed state of the pool (under a different
7707 * name) so we can do verification on it against the current state
7708 * of the pool.
7709 */
7710 checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
7711 NULL);
7712 ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
7713
7714 error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
7715 if (error != 0) {
7716 fatal("Tried to open pool \"%s\" but spa_open() failed with "
7717 "error %d\n", checkpoint_pool, error);
7718 }
7719
7720 /*
7721 * Ensure that ranges in the checkpoint space maps of each vdev
7722 * are allocated according to the checkpointed state's metaslab
7723 * space maps.
7724 */
7725 verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
7726
7727 /*
7728 * Ensure that allocated ranges in the checkpoint's metaslab
7729 * space maps remain allocated in the metaslab space maps of
7730 * the current state.
7731 */
7732 verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
7733
7734 /*
7735 * Once we are done, we get rid of the checkpointed state.
7736 */
7737 spa_close(checkpoint_spa, FTAG);
7738 free(checkpoint_pool);
7739 }
7740
7741 static void
dump_leftover_checkpoint_blocks(spa_t * spa)7742 dump_leftover_checkpoint_blocks(spa_t *spa)
7743 {
7744 vdev_t *rvd = spa->spa_root_vdev;
7745
7746 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
7747 vdev_t *vd = rvd->vdev_child[i];
7748
7749 space_map_t *checkpoint_sm = NULL;
7750 uint64_t checkpoint_sm_obj;
7751
7752 if (vd->vdev_top_zap == 0)
7753 continue;
7754
7755 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
7756 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
7757 continue;
7758
7759 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
7760 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
7761 sizeof (uint64_t), 1, &checkpoint_sm_obj));
7762
7763 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
7764 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
7765 dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
7766 space_map_close(checkpoint_sm);
7767 }
7768 }
7769
7770 static int
verify_checkpoint(spa_t * spa)7771 verify_checkpoint(spa_t *spa)
7772 {
7773 uberblock_t checkpoint;
7774 int error;
7775
7776 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
7777 return (0);
7778
7779 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
7780 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
7781 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
7782
7783 if (error == ENOENT && !dump_opt['L']) {
7784 /*
7785 * If the feature is active but the uberblock is missing
7786 * then we must be in the middle of discarding the
7787 * checkpoint.
7788 */
7789 (void) printf("\nPartially discarded checkpoint "
7790 "state found:\n");
7791 if (dump_opt['m'] > 3)
7792 dump_leftover_checkpoint_blocks(spa);
7793 return (0);
7794 } else if (error != 0) {
7795 (void) printf("lookup error %d when looking for "
7796 "checkpointed uberblock in MOS\n", error);
7797 return (error);
7798 }
7799 dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
7800
7801 if (checkpoint.ub_checkpoint_txg == 0) {
7802 (void) printf("\nub_checkpoint_txg not set in checkpointed "
7803 "uberblock\n");
7804 error = 3;
7805 }
7806
7807 if (error == 0 && !dump_opt['L'])
7808 verify_checkpoint_blocks(spa);
7809
7810 return (error);
7811 }
7812
7813 static void
mos_leaks_cb(void * arg,uint64_t start,uint64_t size)7814 mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
7815 {
7816 (void) arg;
7817 for (uint64_t i = start; i < size; i++) {
7818 (void) printf("MOS object %llu referenced but not allocated\n",
7819 (u_longlong_t)i);
7820 }
7821 }
7822
7823 static void
mos_obj_refd(uint64_t obj)7824 mos_obj_refd(uint64_t obj)
7825 {
7826 if (obj != 0 && mos_refd_objs != NULL)
7827 range_tree_add(mos_refd_objs, obj, 1);
7828 }
7829
7830 /*
7831 * Call on a MOS object that may already have been referenced.
7832 */
7833 static void
mos_obj_refd_multiple(uint64_t obj)7834 mos_obj_refd_multiple(uint64_t obj)
7835 {
7836 if (obj != 0 && mos_refd_objs != NULL &&
7837 !range_tree_contains(mos_refd_objs, obj, 1))
7838 range_tree_add(mos_refd_objs, obj, 1);
7839 }
7840
7841 static void
mos_leak_vdev_top_zap(vdev_t * vd)7842 mos_leak_vdev_top_zap(vdev_t *vd)
7843 {
7844 uint64_t ms_flush_data_obj;
7845 int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
7846 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
7847 sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
7848 if (error == ENOENT)
7849 return;
7850 ASSERT0(error);
7851
7852 mos_obj_refd(ms_flush_data_obj);
7853 }
7854
7855 static void
mos_leak_vdev(vdev_t * vd)7856 mos_leak_vdev(vdev_t *vd)
7857 {
7858 mos_obj_refd(vd->vdev_dtl_object);
7859 mos_obj_refd(vd->vdev_ms_array);
7860 mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
7861 mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
7862 mos_obj_refd(vd->vdev_leaf_zap);
7863 if (vd->vdev_checkpoint_sm != NULL)
7864 mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
7865 if (vd->vdev_indirect_mapping != NULL) {
7866 mos_obj_refd(vd->vdev_indirect_mapping->
7867 vim_phys->vimp_counts_object);
7868 }
7869 if (vd->vdev_obsolete_sm != NULL)
7870 mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
7871
7872 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
7873 metaslab_t *ms = vd->vdev_ms[m];
7874 mos_obj_refd(space_map_object(ms->ms_sm));
7875 }
7876
7877 if (vd->vdev_root_zap != 0)
7878 mos_obj_refd(vd->vdev_root_zap);
7879
7880 if (vd->vdev_top_zap != 0) {
7881 mos_obj_refd(vd->vdev_top_zap);
7882 mos_leak_vdev_top_zap(vd);
7883 }
7884
7885 for (uint64_t c = 0; c < vd->vdev_children; c++) {
7886 mos_leak_vdev(vd->vdev_child[c]);
7887 }
7888 }
7889
7890 static void
mos_leak_log_spacemaps(spa_t * spa)7891 mos_leak_log_spacemaps(spa_t *spa)
7892 {
7893 uint64_t spacemap_zap;
7894 int error = zap_lookup(spa_meta_objset(spa),
7895 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
7896 sizeof (spacemap_zap), 1, &spacemap_zap);
7897 if (error == ENOENT)
7898 return;
7899 ASSERT0(error);
7900
7901 mos_obj_refd(spacemap_zap);
7902 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
7903 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
7904 mos_obj_refd(sls->sls_sm_obj);
7905 }
7906
7907 static void
errorlog_count_refd(objset_t * mos,uint64_t errlog)7908 errorlog_count_refd(objset_t *mos, uint64_t errlog)
7909 {
7910 zap_cursor_t zc;
7911 zap_attribute_t za;
7912 for (zap_cursor_init(&zc, mos, errlog);
7913 zap_cursor_retrieve(&zc, &za) == 0;
7914 zap_cursor_advance(&zc)) {
7915 mos_obj_refd(za.za_first_integer);
7916 }
7917 zap_cursor_fini(&zc);
7918 }
7919
7920 static int
dump_mos_leaks(spa_t * spa)7921 dump_mos_leaks(spa_t *spa)
7922 {
7923 int rv = 0;
7924 objset_t *mos = spa->spa_meta_objset;
7925 dsl_pool_t *dp = spa->spa_dsl_pool;
7926
7927 /* Visit and mark all referenced objects in the MOS */
7928
7929 mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
7930 mos_obj_refd(spa->spa_pool_props_object);
7931 mos_obj_refd(spa->spa_config_object);
7932 mos_obj_refd(spa->spa_ddt_stat_object);
7933 mos_obj_refd(spa->spa_feat_desc_obj);
7934 mos_obj_refd(spa->spa_feat_enabled_txg_obj);
7935 mos_obj_refd(spa->spa_feat_for_read_obj);
7936 mos_obj_refd(spa->spa_feat_for_write_obj);
7937 mos_obj_refd(spa->spa_history);
7938 mos_obj_refd(spa->spa_errlog_last);
7939 mos_obj_refd(spa->spa_errlog_scrub);
7940
7941 if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
7942 errorlog_count_refd(mos, spa->spa_errlog_last);
7943 errorlog_count_refd(mos, spa->spa_errlog_scrub);
7944 }
7945
7946 mos_obj_refd(spa->spa_all_vdev_zaps);
7947 mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
7948 mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
7949 mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
7950 bpobj_count_refd(&spa->spa_deferred_bpobj);
7951 mos_obj_refd(dp->dp_empty_bpobj);
7952 bpobj_count_refd(&dp->dp_obsolete_bpobj);
7953 bpobj_count_refd(&dp->dp_free_bpobj);
7954 mos_obj_refd(spa->spa_l2cache.sav_object);
7955 mos_obj_refd(spa->spa_spares.sav_object);
7956
7957 if (spa->spa_syncing_log_sm != NULL)
7958 mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
7959 mos_leak_log_spacemaps(spa);
7960
7961 mos_obj_refd(spa->spa_condensing_indirect_phys.
7962 scip_next_mapping_object);
7963 mos_obj_refd(spa->spa_condensing_indirect_phys.
7964 scip_prev_obsolete_sm_object);
7965 if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
7966 vdev_indirect_mapping_t *vim =
7967 vdev_indirect_mapping_open(mos,
7968 spa->spa_condensing_indirect_phys.scip_next_mapping_object);
7969 mos_obj_refd(vim->vim_phys->vimp_counts_object);
7970 vdev_indirect_mapping_close(vim);
7971 }
7972 deleted_livelists_dump_mos(spa);
7973
7974 if (dp->dp_origin_snap != NULL) {
7975 dsl_dataset_t *ds;
7976
7977 dsl_pool_config_enter(dp, FTAG);
7978 VERIFY0(dsl_dataset_hold_obj(dp,
7979 dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
7980 FTAG, &ds));
7981 count_ds_mos_objects(ds);
7982 dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
7983 dsl_dataset_rele(ds, FTAG);
7984 dsl_pool_config_exit(dp, FTAG);
7985
7986 count_ds_mos_objects(dp->dp_origin_snap);
7987 dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
7988 }
7989 count_dir_mos_objects(dp->dp_mos_dir);
7990 if (dp->dp_free_dir != NULL)
7991 count_dir_mos_objects(dp->dp_free_dir);
7992 if (dp->dp_leak_dir != NULL)
7993 count_dir_mos_objects(dp->dp_leak_dir);
7994
7995 mos_leak_vdev(spa->spa_root_vdev);
7996
7997 for (uint64_t class = 0; class < DDT_CLASSES; class++) {
7998 for (uint64_t type = 0; type < DDT_TYPES; type++) {
7999 for (uint64_t cksum = 0;
8000 cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
8001 ddt_t *ddt = spa->spa_ddt[cksum];
8002 if (!ddt)
8003 continue;
8004 mos_obj_refd(ddt->ddt_object[type][class]);
8005 }
8006 }
8007 }
8008
8009 if (spa->spa_brt != NULL) {
8010 brt_t *brt = spa->spa_brt;
8011 for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
8012 brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
8013 if (brtvd != NULL && brtvd->bv_initiated) {
8014 mos_obj_refd(brtvd->bv_mos_brtvdev);
8015 mos_obj_refd(brtvd->bv_mos_entries);
8016 }
8017 }
8018 }
8019
8020 /*
8021 * Visit all allocated objects and make sure they are referenced.
8022 */
8023 uint64_t object = 0;
8024 while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
8025 if (range_tree_contains(mos_refd_objs, object, 1)) {
8026 range_tree_remove(mos_refd_objs, object, 1);
8027 } else {
8028 dmu_object_info_t doi;
8029 const char *name;
8030 VERIFY0(dmu_object_info(mos, object, &doi));
8031 if (doi.doi_type & DMU_OT_NEWTYPE) {
8032 dmu_object_byteswap_t bswap =
8033 DMU_OT_BYTESWAP(doi.doi_type);
8034 name = dmu_ot_byteswap[bswap].ob_name;
8035 } else {
8036 name = dmu_ot[doi.doi_type].ot_name;
8037 }
8038
8039 (void) printf("MOS object %llu (%s) leaked\n",
8040 (u_longlong_t)object, name);
8041 rv = 2;
8042 }
8043 }
8044 (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
8045 if (!range_tree_is_empty(mos_refd_objs))
8046 rv = 2;
8047 range_tree_vacate(mos_refd_objs, NULL, NULL);
8048 range_tree_destroy(mos_refd_objs);
8049 return (rv);
8050 }
8051
/*
 * Accumulator handed to log_spacemap_obsolete_stats_cb() while the log
 * space maps are walked.
 */
typedef struct log_sm_obsolete_stats_arg {
	/* txg of the log currently being tallied; 0 until the first entry */
	uint64_t lsos_current_txg;

	/* running totals over every log visited so far */
	uint64_t lsos_total_entries;
	uint64_t lsos_valid_entries;

	/* per-log counters; the callback resets them when the txg advances */
	uint64_t lsos_sm_entries;
	uint64_t lsos_valid_sm_entries;
} log_sm_obsolete_stats_arg_t;
8061
8062 static int
log_spacemap_obsolete_stats_cb(spa_t * spa,space_map_entry_t * sme,uint64_t txg,void * arg)8063 log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
8064 uint64_t txg, void *arg)
8065 {
8066 log_sm_obsolete_stats_arg_t *lsos = arg;
8067
8068 uint64_t offset = sme->sme_offset;
8069 uint64_t vdev_id = sme->sme_vdev;
8070
8071 if (lsos->lsos_current_txg == 0) {
8072 /* this is the first log */
8073 lsos->lsos_current_txg = txg;
8074 } else if (lsos->lsos_current_txg < txg) {
8075 /* we just changed log - print stats and reset */
8076 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
8077 (u_longlong_t)lsos->lsos_valid_sm_entries,
8078 (u_longlong_t)lsos->lsos_sm_entries,
8079 (u_longlong_t)lsos->lsos_current_txg);
8080 lsos->lsos_valid_sm_entries = 0;
8081 lsos->lsos_sm_entries = 0;
8082 lsos->lsos_current_txg = txg;
8083 }
8084 ASSERT3U(lsos->lsos_current_txg, ==, txg);
8085
8086 lsos->lsos_sm_entries++;
8087 lsos->lsos_total_entries++;
8088
8089 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
8090 if (!vdev_is_concrete(vd))
8091 return (0);
8092
8093 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
8094 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
8095
8096 if (txg < metaslab_unflushed_txg(ms))
8097 return (0);
8098 lsos->lsos_valid_sm_entries++;
8099 lsos->lsos_valid_entries++;
8100 return (0);
8101 }
8102
8103 static void
dump_log_spacemap_obsolete_stats(spa_t * spa)8104 dump_log_spacemap_obsolete_stats(spa_t *spa)
8105 {
8106 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
8107 return;
8108
8109 log_sm_obsolete_stats_arg_t lsos = {0};
8110
8111 (void) printf("Log Space Map Obsolete Entry Statistics:\n");
8112
8113 iterate_through_spacemap_logs(spa,
8114 log_spacemap_obsolete_stats_cb, &lsos);
8115
8116 /* print stats for latest log */
8117 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
8118 (u_longlong_t)lsos.lsos_valid_sm_entries,
8119 (u_longlong_t)lsos.lsos_sm_entries,
8120 (u_longlong_t)lsos.lsos_current_txg);
8121
8122 (void) printf("%-8llu valid entries out of %-8llu - total\n\n",
8123 (u_longlong_t)lsos.lsos_valid_entries,
8124 (u_longlong_t)lsos.lsos_total_entries);
8125 }
8126
8127 static void
dump_zpool(spa_t * spa)8128 dump_zpool(spa_t *spa)
8129 {
8130 dsl_pool_t *dp = spa_get_dsl(spa);
8131 int rc = 0;
8132
8133 if (dump_opt['y']) {
8134 livelist_metaslab_validate(spa);
8135 }
8136
8137 if (dump_opt['S']) {
8138 dump_simulated_ddt(spa);
8139 return;
8140 }
8141
8142 if (!dump_opt['e'] && dump_opt['C'] > 1) {
8143 (void) printf("\nCached configuration:\n");
8144 dump_nvlist(spa->spa_config, 8);
8145 }
8146
8147 if (dump_opt['C'])
8148 dump_config(spa);
8149
8150 if (dump_opt['u'])
8151 dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
8152
8153 if (dump_opt['D'])
8154 dump_all_ddts(spa);
8155
8156 if (dump_opt['T'])
8157 dump_brt(spa);
8158
8159 if (dump_opt['d'] > 2 || dump_opt['m'])
8160 dump_metaslabs(spa);
8161 if (dump_opt['M'])
8162 dump_metaslab_groups(spa, dump_opt['M'] > 1);
8163 if (dump_opt['d'] > 2 || dump_opt['m']) {
8164 dump_log_spacemaps(spa);
8165 dump_log_spacemap_obsolete_stats(spa);
8166 }
8167
8168 if (dump_opt['d'] || dump_opt['i']) {
8169 spa_feature_t f;
8170 mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
8171 0);
8172 dump_objset(dp->dp_meta_objset);
8173
8174 if (dump_opt['d'] >= 3) {
8175 dsl_pool_t *dp = spa->spa_dsl_pool;
8176 dump_full_bpobj(&spa->spa_deferred_bpobj,
8177 "Deferred frees", 0);
8178 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
8179 dump_full_bpobj(&dp->dp_free_bpobj,
8180 "Pool snapshot frees", 0);
8181 }
8182 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
8183 ASSERT(spa_feature_is_enabled(spa,
8184 SPA_FEATURE_DEVICE_REMOVAL));
8185 dump_full_bpobj(&dp->dp_obsolete_bpobj,
8186 "Pool obsolete blocks", 0);
8187 }
8188
8189 if (spa_feature_is_active(spa,
8190 SPA_FEATURE_ASYNC_DESTROY)) {
8191 dump_bptree(spa->spa_meta_objset,
8192 dp->dp_bptree_obj,
8193 "Pool dataset frees");
8194 }
8195 dump_dtl(spa->spa_root_vdev, 0);
8196 }
8197
8198 for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
8199 global_feature_count[f] = UINT64_MAX;
8200 global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
8201 global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0;
8202 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
8203 global_feature_count[SPA_FEATURE_LIVELIST] = 0;
8204
8205 (void) dmu_objset_find(spa_name(spa), dump_one_objset,
8206 NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
8207
8208 if (rc == 0 && !dump_opt['L'])
8209 rc = dump_mos_leaks(spa);
8210
8211 for (f = 0; f < SPA_FEATURES; f++) {
8212 uint64_t refcount;
8213
8214 uint64_t *arr;
8215 if (!(spa_feature_table[f].fi_flags &
8216 ZFEATURE_FLAG_PER_DATASET)) {
8217 if (global_feature_count[f] == UINT64_MAX)
8218 continue;
8219 if (!spa_feature_is_enabled(spa, f)) {
8220 ASSERT0(global_feature_count[f]);
8221 continue;
8222 }
8223 arr = global_feature_count;
8224 } else {
8225 if (!spa_feature_is_enabled(spa, f)) {
8226 ASSERT0(dataset_feature_count[f]);
8227 continue;
8228 }
8229 arr = dataset_feature_count;
8230 }
8231 if (feature_get_refcount(spa, &spa_feature_table[f],
8232 &refcount) == ENOTSUP)
8233 continue;
8234 if (arr[f] != refcount) {
8235 (void) printf("%s feature refcount mismatch: "
8236 "%lld consumers != %lld refcount\n",
8237 spa_feature_table[f].fi_uname,
8238 (longlong_t)arr[f], (longlong_t)refcount);
8239 rc = 2;
8240 } else {
8241 (void) printf("Verified %s feature refcount "
8242 "of %llu is correct\n",
8243 spa_feature_table[f].fi_uname,
8244 (longlong_t)refcount);
8245 }
8246 }
8247
8248 if (rc == 0)
8249 rc = verify_device_removal_feature_counts(spa);
8250 }
8251
8252 if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
8253 rc = dump_block_stats(spa);
8254
8255 if (rc == 0)
8256 rc = verify_spacemap_refcounts(spa);
8257
8258 if (dump_opt['s'])
8259 show_pool_stats(spa);
8260
8261 if (dump_opt['h'])
8262 dump_history(spa);
8263
8264 if (rc == 0)
8265 rc = verify_checkpoint(spa);
8266
8267 if (rc != 0) {
8268 dump_debug_buffer();
8269 zdb_exit(rc);
8270 }
8271 }
8272
/*
 * Option bits for zdb_read_block(), accumulated from the flag characters in
 * the ":flags" part of a block descriptor.
 */
#define	ZDB_FLAG_CHECKSUM	0x0001	/* compute and print checksums */
#define	ZDB_FLAG_DECOMPRESS	0x0002	/* decompress before dumping */
#define	ZDB_FLAG_BSWAP		0x0004	/* byteswap before dumping */
#define	ZDB_FLAG_GBH		0x0008	/* dump as a gang block header */
#define	ZDB_FLAG_INDIRECT	0x0010	/* dump as an indirect block */
#define	ZDB_FLAG_RAW		0x0020	/* write raw bytes to stdout */
#define	ZDB_FLAG_PRINT_BLKPTR	0x0040	/* decode a blkptr inside the block */
#define	ZDB_FLAG_VERBOSE	0x0080	/* report decompression attempts */

/*
 * flagbits[] is indexed by flag character and holds that character's
 * ZDB_FLAG_* bit; zdb_read_block() treats 0 as "not a flag".
 */
static int flagbits[256];
/*
 * Consulted with strchr() by zdb_read_block() when isolating a block pointer
 * offset.  It is filled in outside this section; presumably it lists the
 * valid flag characters (TODO confirm).
 */
static char flagbitstr[16];
8284
8285 static void
zdb_print_blkptr(const blkptr_t * bp,int flags)8286 zdb_print_blkptr(const blkptr_t *bp, int flags)
8287 {
8288 char blkbuf[BP_SPRINTF_LEN];
8289
8290 if (flags & ZDB_FLAG_BSWAP)
8291 byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
8292
8293 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
8294 (void) printf("%s\n", blkbuf);
8295 }
8296
8297 static void
zdb_dump_indirect(blkptr_t * bp,int nbps,int flags)8298 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
8299 {
8300 int i;
8301
8302 for (i = 0; i < nbps; i++)
8303 zdb_print_blkptr(&bp[i], flags);
8304 }
8305
8306 static void
zdb_dump_gbh(void * buf,int flags)8307 zdb_dump_gbh(void *buf, int flags)
8308 {
8309 zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
8310 }
8311
8312 static void
zdb_dump_block_raw(void * buf,uint64_t size,int flags)8313 zdb_dump_block_raw(void *buf, uint64_t size, int flags)
8314 {
8315 if (flags & ZDB_FLAG_BSWAP)
8316 byteswap_uint64_array(buf, size);
8317 VERIFY(write(fileno(stdout), buf, size) == size);
8318 }
8319
8320 static void
zdb_dump_block(char * label,void * buf,uint64_t size,int flags)8321 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
8322 {
8323 uint64_t *d = (uint64_t *)buf;
8324 unsigned nwords = size / sizeof (uint64_t);
8325 int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
8326 unsigned i, j;
8327 const char *hdr;
8328 char *c;
8329
8330
8331 if (do_bswap)
8332 hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
8333 else
8334 hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
8335
8336 (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
8337
8338 #ifdef _LITTLE_ENDIAN
8339 /* correct the endianness */
8340 do_bswap = !do_bswap;
8341 #endif
8342 for (i = 0; i < nwords; i += 2) {
8343 (void) printf("%06llx: %016llx %016llx ",
8344 (u_longlong_t)(i * sizeof (uint64_t)),
8345 (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
8346 (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
8347
8348 c = (char *)&d[i];
8349 for (j = 0; j < 2 * sizeof (uint64_t); j++)
8350 (void) printf("%c", isprint(c[j]) ? c[j] : '.');
8351 (void) printf("\n");
8352 }
8353 }
8354
8355 /*
8356 * There are two acceptable formats:
8357 * leaf_name - For example: c1t0d0 or /tmp/ztest.0a
8358 * child[.child]* - For example: 0.1.1
8359 *
8360 * The second form can be used to specify arbitrary vdevs anywhere
8361 * in the hierarchy. For example, in a pool with a mirror of
8362 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
8363 */
8364 static vdev_t *
zdb_vdev_lookup(vdev_t * vdev,const char * path)8365 zdb_vdev_lookup(vdev_t *vdev, const char *path)
8366 {
8367 char *s, *p, *q;
8368 unsigned i;
8369
8370 if (vdev == NULL)
8371 return (NULL);
8372
8373 /* First, assume the x.x.x.x format */
8374 i = strtoul(path, &s, 10);
8375 if (s == path || (s && *s != '.' && *s != '\0'))
8376 goto name;
8377 if (i >= vdev->vdev_children)
8378 return (NULL);
8379
8380 vdev = vdev->vdev_child[i];
8381 if (s && *s == '\0')
8382 return (vdev);
8383 return (zdb_vdev_lookup(vdev, s+1));
8384
8385 name:
8386 for (i = 0; i < vdev->vdev_children; i++) {
8387 vdev_t *vc = vdev->vdev_child[i];
8388
8389 if (vc->vdev_path == NULL) {
8390 vc = zdb_vdev_lookup(vc, path);
8391 if (vc == NULL)
8392 continue;
8393 else
8394 return (vc);
8395 }
8396
8397 p = strrchr(vc->vdev_path, '/');
8398 p = p ? p + 1 : vc->vdev_path;
8399 q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
8400
8401 if (strcmp(vc->vdev_path, path) == 0)
8402 return (vc);
8403 if (strcmp(p, path) == 0)
8404 return (vc);
8405 if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
8406 return (vc);
8407 }
8408
8409 return (NULL);
8410 }
8411
8412 static int
name_from_objset_id(spa_t * spa,uint64_t objset_id,char * outstr)8413 name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
8414 {
8415 dsl_dataset_t *ds;
8416
8417 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
8418 int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
8419 NULL, &ds);
8420 if (error != 0) {
8421 (void) fprintf(stderr, "failed to hold objset %llu: %s\n",
8422 (u_longlong_t)objset_id, strerror(error));
8423 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
8424 return (error);
8425 }
8426 dsl_dataset_name(ds, outstr);
8427 dsl_dataset_rele(ds, NULL);
8428 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
8429 return (0);
8430 }
8431
8432 static boolean_t
zdb_parse_block_sizes(char * sizes,uint64_t * lsize,uint64_t * psize)8433 zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
8434 {
8435 char *s0, *s1, *tmp = NULL;
8436
8437 if (sizes == NULL)
8438 return (B_FALSE);
8439
8440 s0 = strtok_r(sizes, "/", &tmp);
8441 if (s0 == NULL)
8442 return (B_FALSE);
8443 s1 = strtok_r(NULL, "/", &tmp);
8444 *lsize = strtoull(s0, NULL, 16);
8445 *psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
8446 return (*lsize >= *psize && *psize > 0);
8447 }
8448
/*
 * The bit that stands for compression function ZIO_COMPRESS_<alg> in a
 * 64-bit set of compression functions.
 */
#define	ZIO_COMPRESS_MASK(alg)	(1ULL << (ZIO_COMPRESS_##alg))
8450
8451 static boolean_t
try_decompress_block(abd_t * pabd,uint64_t lsize,uint64_t psize,int flags,int cfunc,void * lbuf,void * lbuf2)8452 try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
8453 int flags, int cfunc, void *lbuf, void *lbuf2)
8454 {
8455 if (flags & ZDB_FLAG_VERBOSE) {
8456 (void) fprintf(stderr,
8457 "Trying %05llx -> %05llx (%s)\n",
8458 (u_longlong_t)psize,
8459 (u_longlong_t)lsize,
8460 zio_compress_table[cfunc].ci_name);
8461 }
8462
8463 /*
8464 * We set lbuf to all zeros and lbuf2 to all
8465 * ones, then decompress to both buffers and
8466 * compare their contents. This way we can
8467 * know if decompression filled exactly to
8468 * lsize or if it left some bytes unwritten.
8469 */
8470
8471 memset(lbuf, 0x00, lsize);
8472 memset(lbuf2, 0xff, lsize);
8473
8474 if (zio_decompress_data(cfunc, pabd,
8475 lbuf, psize, lsize, NULL) == 0 &&
8476 zio_decompress_data(cfunc, pabd,
8477 lbuf2, psize, lsize, NULL) == 0 &&
8478 memcmp(lbuf, lbuf2, lsize) == 0)
8479 return (B_TRUE);
8480 return (B_FALSE);
8481 }
8482
8483 static uint64_t
zdb_decompress_block(abd_t * pabd,void * buf,void * lbuf,uint64_t lsize,uint64_t psize,int flags)8484 zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
8485 uint64_t psize, int flags)
8486 {
8487 (void) buf;
8488 uint64_t orig_lsize = lsize;
8489 boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL));
8490 boolean_t found = B_FALSE;
8491 /*
8492 * We don't know how the data was compressed, so just try
8493 * every decompress function at every inflated blocksize.
8494 */
8495 void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
8496 int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
8497 int *cfuncp = cfuncs;
8498 uint64_t maxlsize = SPA_MAXBLOCKSIZE;
8499 uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
8500 ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
8501 ZIO_COMPRESS_MASK(ZLE);
8502 *cfuncp++ = ZIO_COMPRESS_LZ4;
8503 *cfuncp++ = ZIO_COMPRESS_LZJB;
8504 mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
8505 /*
8506 * Every gzip level has the same decompressor, no need to
8507 * run it 9 times per bruteforce attempt.
8508 */
8509 mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3);
8510 mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5);
8511 mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7);
8512 mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9);
8513 for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
8514 if (((1ULL << c) & mask) == 0)
8515 *cfuncp++ = c;
8516
8517 /*
8518 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
8519 * could take a while and we should let the user know
8520 * we are not stuck. On the other hand, printing progress
8521 * info gets old after a while. User can specify 'v' flag
8522 * to see the progression.
8523 */
8524 if (lsize == psize)
8525 lsize += SPA_MINBLOCKSIZE;
8526 else
8527 maxlsize = lsize;
8528
8529 for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
8530 for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
8531 if (try_decompress_block(pabd, lsize, psize, flags,
8532 *cfuncp, lbuf, lbuf2)) {
8533 found = B_TRUE;
8534 break;
8535 }
8536 }
8537 if (*cfuncp != 0)
8538 break;
8539 }
8540 if (!found && tryzle) {
8541 for (lsize = orig_lsize; lsize <= maxlsize;
8542 lsize += SPA_MINBLOCKSIZE) {
8543 if (try_decompress_block(pabd, lsize, psize, flags,
8544 ZIO_COMPRESS_ZLE, lbuf, lbuf2)) {
8545 *cfuncp = ZIO_COMPRESS_ZLE;
8546 found = B_TRUE;
8547 break;
8548 }
8549 }
8550 }
8551 umem_free(lbuf2, SPA_MAXBLOCKSIZE);
8552
8553 if (*cfuncp == ZIO_COMPRESS_ZLE) {
8554 printf("\nZLE decompression was selected. If you "
8555 "suspect the results are wrong,\ntry avoiding ZLE "
8556 "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
8557 }
8558
8559 return (lsize > maxlsize ? -1 : lsize);
8560 }
8561
8562 /*
8563 * Read a block from a pool and print it out. The syntax of the
8564 * block descriptor is:
8565 *
8566 * pool:vdev_specifier:offset:[lsize/]psize[:flags]
8567 *
8568 * pool - The name of the pool you wish to read from
8569 * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
8570 * offset - offset, in hex, in bytes
8571 * size - Amount of data to read, in hex, in bytes
8572 * flags - A string of characters specifying options
8573 * b: Decode a blkptr at given offset within block
8574 * c: Calculate and display checksums
8575 * d: Decompress data before dumping
8576 * e: Byteswap data before dumping
8577 * g: Display data as a gang block header
8578 * i: Display as an indirect block
8579 * r: Dump raw data to stdout
8580 * v: Verbose
8581 *
8582 */
8583 static void
zdb_read_block(char * thing,spa_t * spa)8584 zdb_read_block(char *thing, spa_t *spa)
8585 {
8586 blkptr_t blk, *bp = &blk;
8587 dva_t *dva = bp->blk_dva;
8588 int flags = 0;
8589 uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
8590 zio_t *zio;
8591 vdev_t *vd;
8592 abd_t *pabd;
8593 void *lbuf, *buf;
8594 char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL;
8595 const char *vdev, *errmsg = NULL;
8596 int i, error;
8597 boolean_t borrowed = B_FALSE, found = B_FALSE;
8598
8599 dup = strdup(thing);
8600 s = strtok_r(dup, ":", &tmp);
8601 vdev = s ?: "";
8602 s = strtok_r(NULL, ":", &tmp);
8603 offset = strtoull(s ? s : "", NULL, 16);
8604 sizes = strtok_r(NULL, ":", &tmp);
8605 s = strtok_r(NULL, ":", &tmp);
8606 flagstr = strdup(s ?: "");
8607
8608 if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
8609 errmsg = "invalid size(s)";
8610 if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
8611 errmsg = "size must be a multiple of sector size";
8612 if (!IS_P2ALIGNED(offset, DEV_BSIZE))
8613 errmsg = "offset must be a multiple of sector size";
8614 if (errmsg) {
8615 (void) printf("Invalid block specifier: %s - %s\n",
8616 thing, errmsg);
8617 goto done;
8618 }
8619
8620 tmp = NULL;
8621 for (s = strtok_r(flagstr, ":", &tmp);
8622 s != NULL;
8623 s = strtok_r(NULL, ":", &tmp)) {
8624 for (i = 0; i < strlen(flagstr); i++) {
8625 int bit = flagbits[(uchar_t)flagstr[i]];
8626
8627 if (bit == 0) {
8628 (void) printf("***Ignoring flag: %c\n",
8629 (uchar_t)flagstr[i]);
8630 continue;
8631 }
8632 found = B_TRUE;
8633 flags |= bit;
8634
8635 p = &flagstr[i + 1];
8636 if (*p != ':' && *p != '\0') {
8637 int j = 0, nextbit = flagbits[(uchar_t)*p];
8638 char *end, offstr[8] = { 0 };
8639 if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
8640 (nextbit == 0)) {
8641 /* look ahead to isolate the offset */
8642 while (nextbit == 0 &&
8643 strchr(flagbitstr, *p) == NULL) {
8644 offstr[j] = *p;
8645 j++;
8646 if (i + j > strlen(flagstr))
8647 break;
8648 p++;
8649 nextbit = flagbits[(uchar_t)*p];
8650 }
8651 blkptr_offset = strtoull(offstr, &end,
8652 16);
8653 i += j;
8654 } else if (nextbit == 0) {
8655 (void) printf("***Ignoring flag arg:"
8656 " '%c'\n", (uchar_t)*p);
8657 }
8658 }
8659 }
8660 }
8661 if (blkptr_offset % sizeof (blkptr_t)) {
8662 printf("Block pointer offset 0x%llx "
8663 "must be divisible by 0x%x\n",
8664 (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
8665 goto done;
8666 }
8667 if (found == B_FALSE && strlen(flagstr) > 0) {
8668 printf("Invalid flag arg: '%s'\n", flagstr);
8669 goto done;
8670 }
8671
8672 vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
8673 if (vd == NULL) {
8674 (void) printf("***Invalid vdev: %s\n", vdev);
8675 goto done;
8676 } else {
8677 if (vd->vdev_path)
8678 (void) fprintf(stderr, "Found vdev: %s\n",
8679 vd->vdev_path);
8680 else
8681 (void) fprintf(stderr, "Found vdev type: %s\n",
8682 vd->vdev_ops->vdev_op_type);
8683 }
8684
8685 pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
8686 lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
8687
8688 BP_ZERO(bp);
8689
8690 DVA_SET_VDEV(&dva[0], vd->vdev_id);
8691 DVA_SET_OFFSET(&dva[0], offset);
8692 DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
8693 DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
8694
8695 BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
8696
8697 BP_SET_LSIZE(bp, lsize);
8698 BP_SET_PSIZE(bp, psize);
8699 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
8700 BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
8701 BP_SET_TYPE(bp, DMU_OT_NONE);
8702 BP_SET_LEVEL(bp, 0);
8703 BP_SET_DEDUP(bp, 0);
8704 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
8705
8706 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8707 zio = zio_root(spa, NULL, NULL, 0);
8708
8709 if (vd == vd->vdev_top) {
8710 /*
8711 * Treat this as a normal block read.
8712 */
8713 zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
8714 ZIO_PRIORITY_SYNC_READ,
8715 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
8716 } else {
8717 /*
8718 * Treat this as a vdev child I/O.
8719 */
8720 zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
8721 psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
8722 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
8723 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
8724 NULL, NULL));
8725 }
8726
8727 error = zio_wait(zio);
8728 spa_config_exit(spa, SCL_STATE, FTAG);
8729
8730 if (error) {
8731 (void) printf("Read of %s failed, error: %d\n", thing, error);
8732 goto out;
8733 }
8734
8735 uint64_t orig_lsize = lsize;
8736 buf = lbuf;
8737 if (flags & ZDB_FLAG_DECOMPRESS) {
8738 lsize = zdb_decompress_block(pabd, buf, lbuf,
8739 lsize, psize, flags);
8740 if (lsize == -1) {
8741 (void) printf("Decompress of %s failed\n", thing);
8742 goto out;
8743 }
8744 } else {
8745 buf = abd_borrow_buf_copy(pabd, lsize);
8746 borrowed = B_TRUE;
8747 }
8748 /*
8749 * Try to detect invalid block pointer. If invalid, try
8750 * decompressing.
8751 */
8752 if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
8753 !(flags & ZDB_FLAG_DECOMPRESS)) {
8754 const blkptr_t *b = (const blkptr_t *)(void *)
8755 ((uintptr_t)buf + (uintptr_t)blkptr_offset);
8756 if (zfs_blkptr_verify(spa, b,
8757 BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
8758 abd_return_buf_copy(pabd, buf, lsize);
8759 borrowed = B_FALSE;
8760 buf = lbuf;
8761 lsize = zdb_decompress_block(pabd, buf,
8762 lbuf, lsize, psize, flags);
8763 b = (const blkptr_t *)(void *)
8764 ((uintptr_t)buf + (uintptr_t)blkptr_offset);
8765 if (lsize == -1 || zfs_blkptr_verify(spa, b,
8766 BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
8767 printf("invalid block pointer at this DVA\n");
8768 goto out;
8769 }
8770 }
8771 }
8772
8773 if (flags & ZDB_FLAG_PRINT_BLKPTR)
8774 zdb_print_blkptr((blkptr_t *)(void *)
8775 ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
8776 else if (flags & ZDB_FLAG_RAW)
8777 zdb_dump_block_raw(buf, lsize, flags);
8778 else if (flags & ZDB_FLAG_INDIRECT)
8779 zdb_dump_indirect((blkptr_t *)buf,
8780 orig_lsize / sizeof (blkptr_t), flags);
8781 else if (flags & ZDB_FLAG_GBH)
8782 zdb_dump_gbh(buf, flags);
8783 else
8784 zdb_dump_block(thing, buf, lsize, flags);
8785
8786 /*
8787 * If :c was specified, iterate through the checksum table to
8788 * calculate and display each checksum for our specified
8789 * DVA and length.
8790 */
8791 if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
8792 !(flags & ZDB_FLAG_GBH)) {
8793 zio_t *czio;
8794 (void) printf("\n");
8795 for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
8796 ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
8797
8798 if ((zio_checksum_table[ck].ci_flags &
8799 ZCHECKSUM_FLAG_EMBEDDED) ||
8800 ck == ZIO_CHECKSUM_NOPARITY) {
8801 continue;
8802 }
8803 BP_SET_CHECKSUM(bp, ck);
8804 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8805 czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
8806 if (vd == vd->vdev_top) {
8807 zio_nowait(zio_read(czio, spa, bp, pabd, psize,
8808 NULL, NULL,
8809 ZIO_PRIORITY_SYNC_READ,
8810 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
8811 ZIO_FLAG_DONT_RETRY, NULL));
8812 } else {
8813 zio_nowait(zio_vdev_child_io(czio, bp, vd,
8814 offset, pabd, psize, ZIO_TYPE_READ,
8815 ZIO_PRIORITY_SYNC_READ,
8816 ZIO_FLAG_DONT_PROPAGATE |
8817 ZIO_FLAG_DONT_RETRY |
8818 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
8819 ZIO_FLAG_SPECULATIVE |
8820 ZIO_FLAG_OPTIONAL, NULL, NULL));
8821 }
8822 error = zio_wait(czio);
8823 if (error == 0 || error == ECKSUM) {
8824 zio_t *ck_zio = zio_null(NULL, spa, NULL,
8825 NULL, NULL, 0);
8826 ck_zio->io_offset =
8827 DVA_GET_OFFSET(&bp->blk_dva[0]);
8828 ck_zio->io_bp = bp;
8829 zio_checksum_compute(ck_zio, ck, pabd, lsize);
8830 printf(
8831 "%12s\t"
8832 "cksum=%016llx:%016llx:%016llx:%016llx\n",
8833 zio_checksum_table[ck].ci_name,
8834 (u_longlong_t)bp->blk_cksum.zc_word[0],
8835 (u_longlong_t)bp->blk_cksum.zc_word[1],
8836 (u_longlong_t)bp->blk_cksum.zc_word[2],
8837 (u_longlong_t)bp->blk_cksum.zc_word[3]);
8838 zio_wait(ck_zio);
8839 } else {
8840 printf("error %d reading block\n", error);
8841 }
8842 spa_config_exit(spa, SCL_STATE, FTAG);
8843 }
8844 }
8845
8846 if (borrowed)
8847 abd_return_buf_copy(pabd, buf, lsize);
8848
8849 out:
8850 abd_free(pabd);
8851 umem_free(lbuf, SPA_MAXBLOCKSIZE);
8852 done:
8853 free(flagstr);
8854 free(dup);
8855 }
8856
8857 static void
zdb_embedded_block(char * thing)8858 zdb_embedded_block(char *thing)
8859 {
8860 blkptr_t bp = {{{{0}}}};
8861 unsigned long long *words = (void *)&bp;
8862 char *buf;
8863 int err;
8864
8865 err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
8866 "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
8867 words + 0, words + 1, words + 2, words + 3,
8868 words + 4, words + 5, words + 6, words + 7,
8869 words + 8, words + 9, words + 10, words + 11,
8870 words + 12, words + 13, words + 14, words + 15);
8871 if (err != 16) {
8872 (void) fprintf(stderr, "invalid input format\n");
8873 zdb_exit(1);
8874 }
8875 ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
8876 buf = malloc(SPA_MAXBLOCKSIZE);
8877 if (buf == NULL) {
8878 (void) fprintf(stderr, "out of memory\n");
8879 zdb_exit(1);
8880 }
8881 err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
8882 if (err != 0) {
8883 (void) fprintf(stderr, "decode failed: %u\n", err);
8884 zdb_exit(1);
8885 }
8886 zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
8887 free(buf);
8888 }
8889
8890 /* check for valid hex or decimal numeric string */
8891 static boolean_t
zdb_numeric(char * str)8892 zdb_numeric(char *str)
8893 {
8894 int i = 0;
8895
8896 if (strlen(str) == 0)
8897 return (B_FALSE);
8898 if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0)
8899 i = 2;
8900 for (; i < strlen(str); i++) {
8901 if (!isxdigit(str[i]))
8902 return (B_FALSE);
8903 }
8904 return (B_TRUE);
8905 }
8906
8907 int
main(int argc,char ** argv)8908 main(int argc, char **argv)
8909 {
8910 int c;
8911 int dump_all = 1;
8912 int verbose = 0;
8913 int error = 0;
8914 char **searchdirs = NULL;
8915 int nsearch = 0;
8916 char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
8917 nvlist_t *policy = NULL;
8918 uint64_t max_txg = UINT64_MAX;
8919 int64_t objset_id = -1;
8920 uint64_t object;
8921 int flags = ZFS_IMPORT_MISSING_LOG;
8922 int rewind = ZPOOL_NEVER_REWIND;
8923 char *spa_config_path_env, *objset_str;
8924 boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
8925 nvlist_t *cfg = NULL;
8926 struct sigaction action;
8927
8928 dprintf_setup(&argc, argv);
8929
8930 /*
8931 * Set up signal handlers, so if we crash due to bad on-disk data we
8932 * can get more info. Unlike ztest, we don't bail out if we can't set
8933 * up signal handlers, because zdb is very useful without them.
8934 */
8935 action.sa_handler = sig_handler;
8936 sigemptyset(&action.sa_mask);
8937 action.sa_flags = 0;
8938 if (sigaction(SIGSEGV, &action, NULL) < 0) {
8939 (void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n",
8940 strerror(errno));
8941 }
8942 if (sigaction(SIGABRT, &action, NULL) < 0) {
8943 (void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n",
8944 strerror(errno));
8945 }
8946
8947 /*
8948 * If there is an environment variable SPA_CONFIG_PATH it overrides
8949 * default spa_config_path setting. If -U flag is specified it will
8950 * override this environment variable settings once again.
8951 */
8952 spa_config_path_env = getenv("SPA_CONFIG_PATH");
8953 if (spa_config_path_env != NULL)
8954 spa_config_path = spa_config_path_env;
8955
8956 /*
8957 * For performance reasons, we set this tunable down. We do so before
8958 * the arg parsing section so that the user can override this value if
8959 * they choose.
8960 */
8961 zfs_btree_verify_intensity = 3;
8962
8963 struct option long_options[] = {
8964 {"ignore-assertions", no_argument, NULL, 'A'},
8965 {"block-stats", no_argument, NULL, 'b'},
8966 {"backup", no_argument, NULL, 'B'},
8967 {"checksum", no_argument, NULL, 'c'},
8968 {"config", no_argument, NULL, 'C'},
8969 {"datasets", no_argument, NULL, 'd'},
8970 {"dedup-stats", no_argument, NULL, 'D'},
8971 {"exported", no_argument, NULL, 'e'},
8972 {"embedded-block-pointer", no_argument, NULL, 'E'},
8973 {"automatic-rewind", no_argument, NULL, 'F'},
8974 {"dump-debug-msg", no_argument, NULL, 'G'},
8975 {"history", no_argument, NULL, 'h'},
8976 {"intent-logs", no_argument, NULL, 'i'},
8977 {"inflight", required_argument, NULL, 'I'},
8978 {"checkpointed-state", no_argument, NULL, 'k'},
8979 {"key", required_argument, NULL, 'K'},
8980 {"label", no_argument, NULL, 'l'},
8981 {"disable-leak-tracking", no_argument, NULL, 'L'},
8982 {"metaslabs", no_argument, NULL, 'm'},
8983 {"metaslab-groups", no_argument, NULL, 'M'},
8984 {"numeric", no_argument, NULL, 'N'},
8985 {"option", required_argument, NULL, 'o'},
8986 {"object-lookups", no_argument, NULL, 'O'},
8987 {"path", required_argument, NULL, 'p'},
8988 {"parseable", no_argument, NULL, 'P'},
8989 {"skip-label", no_argument, NULL, 'q'},
8990 {"copy-object", no_argument, NULL, 'r'},
8991 {"read-block", no_argument, NULL, 'R'},
8992 {"io-stats", no_argument, NULL, 's'},
8993 {"simulate-dedup", no_argument, NULL, 'S'},
8994 {"txg", required_argument, NULL, 't'},
8995 {"brt-stats", no_argument, NULL, 'T'},
8996 {"uberblock", no_argument, NULL, 'u'},
8997 {"cachefile", required_argument, NULL, 'U'},
8998 {"verbose", no_argument, NULL, 'v'},
8999 {"verbatim", no_argument, NULL, 'V'},
9000 {"dump-blocks", required_argument, NULL, 'x'},
9001 {"extreme-rewind", no_argument, NULL, 'X'},
9002 {"all-reconstruction", no_argument, NULL, 'Y'},
9003 {"livelist", no_argument, NULL, 'y'},
9004 {"zstd-headers", no_argument, NULL, 'Z'},
9005 {0, 0, 0, 0}
9006 };
9007
9008 while ((c = getopt_long(argc, argv,
9009 "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
9010 long_options, NULL)) != -1) {
9011 switch (c) {
9012 case 'b':
9013 case 'B':
9014 case 'c':
9015 case 'C':
9016 case 'd':
9017 case 'D':
9018 case 'E':
9019 case 'G':
9020 case 'h':
9021 case 'i':
9022 case 'l':
9023 case 'm':
9024 case 'M':
9025 case 'N':
9026 case 'O':
9027 case 'r':
9028 case 'R':
9029 case 's':
9030 case 'S':
9031 case 'T':
9032 case 'u':
9033 case 'y':
9034 case 'Z':
9035 dump_opt[c]++;
9036 dump_all = 0;
9037 break;
9038 case 'A':
9039 case 'e':
9040 case 'F':
9041 case 'k':
9042 case 'L':
9043 case 'P':
9044 case 'q':
9045 case 'X':
9046 dump_opt[c]++;
9047 break;
9048 case 'Y':
9049 zfs_reconstruct_indirect_combinations_max = INT_MAX;
9050 zfs_deadman_enabled = 0;
9051 break;
9052 /* NB: Sort single match options below. */
9053 case 'I':
9054 max_inflight_bytes = strtoull(optarg, NULL, 0);
9055 if (max_inflight_bytes == 0) {
9056 (void) fprintf(stderr, "maximum number "
9057 "of inflight bytes must be greater "
9058 "than 0\n");
9059 usage();
9060 }
9061 break;
9062 case 'K':
9063 dump_opt[c]++;
9064 key_material = strdup(optarg);
9065 /* redact key material in process table */
9066 while (*optarg != '\0') { *optarg++ = '*'; }
9067 break;
9068 case 'o':
9069 error = set_global_var(optarg);
9070 if (error != 0)
9071 usage();
9072 break;
9073 case 'p':
9074 if (searchdirs == NULL) {
9075 searchdirs = umem_alloc(sizeof (char *),
9076 UMEM_NOFAIL);
9077 } else {
9078 char **tmp = umem_alloc((nsearch + 1) *
9079 sizeof (char *), UMEM_NOFAIL);
9080 memcpy(tmp, searchdirs, nsearch *
9081 sizeof (char *));
9082 umem_free(searchdirs,
9083 nsearch * sizeof (char *));
9084 searchdirs = tmp;
9085 }
9086 searchdirs[nsearch++] = optarg;
9087 break;
9088 case 't':
9089 max_txg = strtoull(optarg, NULL, 0);
9090 if (max_txg < TXG_INITIAL) {
9091 (void) fprintf(stderr, "incorrect txg "
9092 "specified: %s\n", optarg);
9093 usage();
9094 }
9095 break;
9096 case 'U':
9097 spa_config_path = optarg;
9098 if (spa_config_path[0] != '/') {
9099 (void) fprintf(stderr,
9100 "cachefile must be an absolute path "
9101 "(i.e. start with a slash)\n");
9102 usage();
9103 }
9104 break;
9105 case 'v':
9106 verbose++;
9107 break;
9108 case 'V':
9109 flags = ZFS_IMPORT_VERBATIM;
9110 break;
9111 case 'x':
9112 vn_dumpdir = optarg;
9113 break;
9114 default:
9115 usage();
9116 break;
9117 }
9118 }
9119
9120 if (!dump_opt['e'] && searchdirs != NULL) {
9121 (void) fprintf(stderr, "-p option requires use of -e\n");
9122 usage();
9123 }
9124 #if defined(_LP64)
9125 /*
9126 * ZDB does not typically re-read blocks; therefore limit the ARC
9127 * to 256 MB, which can be used entirely for metadata.
9128 */
9129 zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
9130 zfs_arc_max = 256 * 1024 * 1024;
9131 #endif
9132
9133 /*
9134 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
9135 * "zdb -b" uses traversal prefetch which uses async reads.
9136 * For good performance, let several of them be active at once.
9137 */
9138 zfs_vdev_async_read_max_active = 10;
9139
9140 /*
9141 * Disable reference tracking for better performance.
9142 */
9143 reference_tracking_enable = B_FALSE;
9144
9145 /*
9146 * Do not fail spa_load when spa_load_verify fails. This is needed
9147 * to load non-idle pools.
9148 */
9149 spa_load_verify_dryrun = B_TRUE;
9150
9151 /*
9152 * ZDB should have ability to read spacemaps.
9153 */
9154 spa_mode_readable_spacemaps = B_TRUE;
9155
9156 kernel_init(SPA_MODE_READ);
9157 kernel_init_done = B_TRUE;
9158
9159 if (dump_all)
9160 verbose = MAX(verbose, 1);
9161
9162 for (c = 0; c < 256; c++) {
9163 if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
9164 dump_opt[c] = 1;
9165 if (dump_opt[c])
9166 dump_opt[c] += verbose;
9167 }
9168
9169 libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));
9170 zfs_recover = (dump_opt['A'] > 1);
9171
9172 argc -= optind;
9173 argv += optind;
9174 if (argc < 2 && dump_opt['R'])
9175 usage();
9176
9177 if (dump_opt['E']) {
9178 if (argc != 1)
9179 usage();
9180 zdb_embedded_block(argv[0]);
9181 error = 0;
9182 goto fini;
9183 }
9184
9185 if (argc < 1) {
9186 if (!dump_opt['e'] && dump_opt['C']) {
9187 dump_cachefile(spa_config_path);
9188 error = 0;
9189 goto fini;
9190 }
9191 usage();
9192 }
9193
9194 if (dump_opt['l']) {
9195 error = dump_label(argv[0]);
9196 goto fini;
9197 }
9198
9199 if (dump_opt['X'] || dump_opt['F'])
9200 rewind = ZPOOL_DO_REWIND |
9201 (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
9202
9203 /* -N implies -d */
9204 if (dump_opt['N'] && dump_opt['d'] == 0)
9205 dump_opt['d'] = dump_opt['N'];
9206
9207 if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
9208 nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
9209 nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
9210 fatal("internal error: %s", strerror(ENOMEM));
9211
9212 error = 0;
9213 target = argv[0];
9214
9215 if (strpbrk(target, "/@") != NULL) {
9216 size_t targetlen;
9217
9218 target_pool = strdup(target);
9219 *strpbrk(target_pool, "/@") = '\0';
9220
9221 target_is_spa = B_FALSE;
9222 targetlen = strlen(target);
9223 if (targetlen && target[targetlen - 1] == '/')
9224 target[targetlen - 1] = '\0';
9225
9226 /*
9227 * See if an objset ID was supplied (-d <pool>/<objset ID>).
9228 * To disambiguate tank/100, consider the 100 as objsetID
9229 * if -N was given, otherwise 100 is an objsetID iff
9230 * tank/100 as a named dataset fails on lookup.
9231 */
9232 objset_str = strchr(target, '/');
9233 if (objset_str && strlen(objset_str) > 1 &&
9234 zdb_numeric(objset_str + 1)) {
9235 char *endptr;
9236 errno = 0;
9237 objset_str++;
9238 objset_id = strtoull(objset_str, &endptr, 0);
9239 /* dataset 0 is the same as opening the pool */
9240 if (errno == 0 && endptr != objset_str &&
9241 objset_id != 0) {
9242 if (dump_opt['N'])
9243 dataset_lookup = B_TRUE;
9244 }
9245 /* normal dataset name not an objset ID */
9246 if (endptr == objset_str) {
9247 objset_id = -1;
9248 }
9249 } else if (objset_str && !zdb_numeric(objset_str + 1) &&
9250 dump_opt['N']) {
9251 printf("Supply a numeric objset ID with -N\n");
9252 error = 1;
9253 goto fini;
9254 }
9255 } else {
9256 target_pool = target;
9257 }
9258
9259 if (dump_opt['e']) {
9260 importargs_t args = { 0 };
9261
9262 args.paths = nsearch;
9263 args.path = searchdirs;
9264 args.can_be_active = B_TRUE;
9265
9266 libpc_handle_t lpch = {
9267 .lpc_lib_handle = NULL,
9268 .lpc_ops = &libzpool_config_ops,
9269 .lpc_printerr = B_TRUE
9270 };
9271 error = zpool_find_config(&lpch, target_pool, &cfg, &args);
9272
9273 if (error == 0) {
9274
9275 if (nvlist_add_nvlist(cfg,
9276 ZPOOL_LOAD_POLICY, policy) != 0) {
9277 fatal("can't open '%s': %s",
9278 target, strerror(ENOMEM));
9279 }
9280
9281 if (dump_opt['C'] > 1) {
9282 (void) printf("\nConfiguration for import:\n");
9283 dump_nvlist(cfg, 8);
9284 }
9285
9286 /*
9287 * Disable the activity check to allow examination of
9288 * active pools.
9289 */
9290 error = spa_import(target_pool, cfg, NULL,
9291 flags | ZFS_IMPORT_SKIP_MMP);
9292 }
9293 }
9294
9295 if (searchdirs != NULL) {
9296 umem_free(searchdirs, nsearch * sizeof (char *));
9297 searchdirs = NULL;
9298 }
9299
9300 /*
9301 * We need to make sure to process -O option or call
9302 * dump_path after the -e option has been processed,
9303 * which imports the pool to the namespace if it's
9304 * not in the cachefile.
9305 */
9306 if (dump_opt['O']) {
9307 if (argc != 2)
9308 usage();
9309 dump_opt['v'] = verbose + 3;
9310 error = dump_path(argv[0], argv[1], NULL);
9311 goto fini;
9312 }
9313
9314 if (dump_opt['r']) {
9315 target_is_spa = B_FALSE;
9316 if (argc != 3)
9317 usage();
9318 dump_opt['v'] = verbose;
9319 error = dump_path(argv[0], argv[1], &object);
9320 if (error != 0)
9321 fatal("internal error: %s", strerror(error));
9322 }
9323
9324 /*
9325 * import_checkpointed_state makes the assumption that the
9326 * target pool that we pass it is already part of the spa
9327 * namespace. Because of that we need to make sure to call
9328 * it always after the -e option has been processed, which
9329 * imports the pool to the namespace if it's not in the
9330 * cachefile.
9331 */
9332 char *checkpoint_pool = NULL;
9333 char *checkpoint_target = NULL;
9334 if (dump_opt['k']) {
9335 checkpoint_pool = import_checkpointed_state(target, cfg,
9336 &checkpoint_target);
9337
9338 if (checkpoint_target != NULL)
9339 target = checkpoint_target;
9340 }
9341
9342 if (cfg != NULL) {
9343 nvlist_free(cfg);
9344 cfg = NULL;
9345 }
9346
9347 if (target_pool != target)
9348 free(target_pool);
9349
9350 if (error == 0) {
9351 if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
9352 ASSERT(checkpoint_pool != NULL);
9353 ASSERT(checkpoint_target == NULL);
9354
9355 error = spa_open(checkpoint_pool, &spa, FTAG);
9356 if (error != 0) {
9357 fatal("Tried to open pool \"%s\" but "
9358 "spa_open() failed with error %d\n",
9359 checkpoint_pool, error);
9360 }
9361
9362 } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
9363 objset_id == 0) {
9364 zdb_set_skip_mmp(target);
9365 error = spa_open_rewind(target, &spa, FTAG, policy,
9366 NULL);
9367 if (error) {
9368 /*
9369 * If we're missing the log device then
9370 * try opening the pool after clearing the
9371 * log state.
9372 */
9373 mutex_enter(&spa_namespace_lock);
9374 if ((spa = spa_lookup(target)) != NULL &&
9375 spa->spa_log_state == SPA_LOG_MISSING) {
9376 spa->spa_log_state = SPA_LOG_CLEAR;
9377 error = 0;
9378 }
9379 mutex_exit(&spa_namespace_lock);
9380
9381 if (!error) {
9382 error = spa_open_rewind(target, &spa,
9383 FTAG, policy, NULL);
9384 }
9385 }
9386 } else if (strpbrk(target, "#") != NULL) {
9387 dsl_pool_t *dp;
9388 error = dsl_pool_hold(target, FTAG, &dp);
9389 if (error != 0) {
9390 fatal("can't dump '%s': %s", target,
9391 strerror(error));
9392 }
9393 error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
9394 dsl_pool_rele(dp, FTAG);
9395 if (error != 0) {
9396 fatal("can't dump '%s': %s", target,
9397 strerror(error));
9398 }
9399 goto fini;
9400 } else {
9401 target_pool = strdup(target);
9402 if (strpbrk(target, "/@") != NULL)
9403 *strpbrk(target_pool, "/@") = '\0';
9404
9405 zdb_set_skip_mmp(target);
9406 /*
9407 * If -N was supplied, the user has indicated that
9408 * zdb -d <pool>/<objsetID> is in effect. Otherwise
9409 * we first assume that the dataset string is the
9410 * dataset name. If dmu_objset_hold fails with the
9411 * dataset string, and we have an objset_id, retry the
9412 * lookup with the objsetID.
9413 */
9414 boolean_t retry = B_TRUE;
9415 retry_lookup:
9416 if (dataset_lookup == B_TRUE) {
9417 /*
9418 * Use the supplied id to get the name
9419 * for open_objset.
9420 */
9421 error = spa_open(target_pool, &spa, FTAG);
9422 if (error == 0) {
9423 error = name_from_objset_id(spa,
9424 objset_id, dsname);
9425 spa_close(spa, FTAG);
9426 if (error == 0)
9427 target = dsname;
9428 }
9429 }
9430 if (error == 0) {
9431 if (objset_id > 0 && retry) {
9432 int err = dmu_objset_hold(target, FTAG,
9433 &os);
9434 if (err) {
9435 dataset_lookup = B_TRUE;
9436 retry = B_FALSE;
9437 goto retry_lookup;
9438 } else {
9439 dmu_objset_rele(os, FTAG);
9440 }
9441 }
9442 error = open_objset(target, FTAG, &os);
9443 }
9444 if (error == 0)
9445 spa = dmu_objset_spa(os);
9446 free(target_pool);
9447 }
9448 }
9449 nvlist_free(policy);
9450
9451 if (error)
9452 fatal("can't open '%s': %s", target, strerror(error));
9453
9454 /*
9455 * Set the pool failure mode to panic in order to prevent the pool
9456 * from suspending. A suspended I/O will have no way to resume and
9457 * can prevent the zdb(8) command from terminating as expected.
9458 */
9459 if (spa != NULL)
9460 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
9461
9462 argv++;
9463 argc--;
9464 if (dump_opt['r']) {
9465 error = zdb_copy_object(os, object, argv[1]);
9466 } else if (!dump_opt['R']) {
9467 flagbits['d'] = ZOR_FLAG_DIRECTORY;
9468 flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
9469 flagbits['m'] = ZOR_FLAG_SPACE_MAP;
9470 flagbits['z'] = ZOR_FLAG_ZAP;
9471 flagbits['A'] = ZOR_FLAG_ALL_TYPES;
9472
9473 if (argc > 0 && dump_opt['d']) {
9474 zopt_object_args = argc;
9475 zopt_object_ranges = calloc(zopt_object_args,
9476 sizeof (zopt_object_range_t));
9477 for (unsigned i = 0; i < zopt_object_args; i++) {
9478 int err;
9479 const char *msg = NULL;
9480
9481 err = parse_object_range(argv[i],
9482 &zopt_object_ranges[i], &msg);
9483 if (err != 0)
9484 fatal("Bad object or range: '%s': %s\n",
9485 argv[i], msg ?: "");
9486 }
9487 } else if (argc > 0 && dump_opt['m']) {
9488 zopt_metaslab_args = argc;
9489 zopt_metaslab = calloc(zopt_metaslab_args,
9490 sizeof (uint64_t));
9491 for (unsigned i = 0; i < zopt_metaslab_args; i++) {
9492 errno = 0;
9493 zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
9494 if (zopt_metaslab[i] == 0 && errno != 0)
9495 fatal("bad number %s: %s", argv[i],
9496 strerror(errno));
9497 }
9498 }
9499 if (dump_opt['B']) {
9500 dump_backup(target, objset_id,
9501 argc > 0 ? argv[0] : NULL);
9502 } else if (os != NULL) {
9503 dump_objset(os);
9504 } else if (zopt_object_args > 0 && !dump_opt['m']) {
9505 dump_objset(spa->spa_meta_objset);
9506 } else {
9507 dump_zpool(spa);
9508 }
9509 } else {
9510 flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
9511 flagbits['c'] = ZDB_FLAG_CHECKSUM;
9512 flagbits['d'] = ZDB_FLAG_DECOMPRESS;
9513 flagbits['e'] = ZDB_FLAG_BSWAP;
9514 flagbits['g'] = ZDB_FLAG_GBH;
9515 flagbits['i'] = ZDB_FLAG_INDIRECT;
9516 flagbits['r'] = ZDB_FLAG_RAW;
9517 flagbits['v'] = ZDB_FLAG_VERBOSE;
9518
9519 for (int i = 0; i < argc; i++)
9520 zdb_read_block(argv[i], spa);
9521 }
9522
9523 if (dump_opt['k']) {
9524 free(checkpoint_pool);
9525 if (!target_is_spa)
9526 free(checkpoint_target);
9527 }
9528
9529 fini:
9530 if (os != NULL) {
9531 close_objset(os, FTAG);
9532 } else if (spa != NULL) {
9533 spa_close(spa, FTAG);
9534 }
9535
9536 fuid_table_destroy();
9537
9538 dump_debug_buffer();
9539
9540 if (kernel_init_done)
9541 kernel_fini();
9542
9543 return (error);
9544 }
9545