1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright (c) 2019, Klara Inc.
28 * Copyright (c) 2019, Allan Jude
29 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
30 */
31
32 #include <sys/zfs_context.h>
33 #include <sys/arc.h>
34 #include <sys/dmu.h>
35 #include <sys/dmu_send.h>
36 #include <sys/dmu_impl.h>
37 #include <sys/dbuf.h>
38 #include <sys/dmu_objset.h>
39 #include <sys/dsl_dataset.h>
40 #include <sys/dsl_dir.h>
41 #include <sys/dmu_tx.h>
42 #include <sys/spa.h>
43 #include <sys/zio.h>
44 #include <sys/dmu_zfetch.h>
45 #include <sys/sa.h>
46 #include <sys/sa_impl.h>
47 #include <sys/zfeature.h>
48 #include <sys/blkptr.h>
49 #include <sys/range_tree.h>
50 #include <sys/trace_zfs.h>
51 #include <sys/callb.h>
52 #include <sys/abd.h>
53 #include <sys/brt.h>
54 #include <sys/vdev.h>
55 #include <cityhash.h>
56 #include <sys/spa_impl.h>
57 #include <sys/wmsum.h>
58 #include <sys/vdev_impl.h>
59
60 static kstat_t *dbuf_ksp;
61
62 typedef struct dbuf_stats {
63 /*
64 * Various statistics about the size of the dbuf cache.
65 */
66 kstat_named_t cache_count;
67 kstat_named_t cache_size_bytes;
68 kstat_named_t cache_size_bytes_max;
69 /*
70 * Statistics regarding the bounds on the dbuf cache size.
71 */
72 kstat_named_t cache_target_bytes;
73 kstat_named_t cache_lowater_bytes;
74 kstat_named_t cache_hiwater_bytes;
75 /*
76 * Total number of dbuf cache evictions that have occurred.
77 */
78 kstat_named_t cache_total_evicts;
79 /*
80 * The distribution of dbuf levels in the dbuf cache and
81 * the total size of all dbufs at each level.
82 */
83 kstat_named_t cache_levels[DN_MAX_LEVELS];
84 kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
85 /*
86 * Statistics about the dbuf hash table.
87 */
88 kstat_named_t hash_hits;
89 kstat_named_t hash_misses;
90 kstat_named_t hash_collisions;
91 kstat_named_t hash_elements;
92 kstat_named_t hash_elements_max;
93 /*
94 * Number of sublists containing more than one dbuf in the dbuf
95 * hash table. Keep track of the longest hash chain.
96 */
97 kstat_named_t hash_chains;
98 kstat_named_t hash_chain_max;
99 /*
100 * Number of times a dbuf_create() discovers that a dbuf was
101 * already created and in the dbuf hash table.
102 */
103 kstat_named_t hash_insert_race;
104 /*
105 * Number of entries in the hash table dbuf and mutex arrays.
106 */
107 kstat_named_t hash_table_count;
108 kstat_named_t hash_mutex_count;
109 /*
110 * Statistics about the size of the metadata dbuf cache.
111 */
112 kstat_named_t metadata_cache_count;
113 kstat_named_t metadata_cache_size_bytes;
114 kstat_named_t metadata_cache_size_bytes_max;
115 /*
116 * For diagnostic purposes, this is incremented whenever we can't add
117 * something to the metadata cache because it's full, and instead put
118 * the data in the regular dbuf cache.
119 */
120 kstat_named_t metadata_cache_overflow;
121 } dbuf_stats_t;
122
123 dbuf_stats_t dbuf_stats = {
124 { "cache_count", KSTAT_DATA_UINT64 },
125 { "cache_size_bytes", KSTAT_DATA_UINT64 },
126 { "cache_size_bytes_max", KSTAT_DATA_UINT64 },
127 { "cache_target_bytes", KSTAT_DATA_UINT64 },
128 { "cache_lowater_bytes", KSTAT_DATA_UINT64 },
129 { "cache_hiwater_bytes", KSTAT_DATA_UINT64 },
130 { "cache_total_evicts", KSTAT_DATA_UINT64 },
131 { { "cache_levels_N", KSTAT_DATA_UINT64 } },
132 { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },
133 { "hash_hits", KSTAT_DATA_UINT64 },
134 { "hash_misses", KSTAT_DATA_UINT64 },
135 { "hash_collisions", KSTAT_DATA_UINT64 },
136 { "hash_elements", KSTAT_DATA_UINT64 },
137 { "hash_elements_max", KSTAT_DATA_UINT64 },
138 { "hash_chains", KSTAT_DATA_UINT64 },
139 { "hash_chain_max", KSTAT_DATA_UINT64 },
140 { "hash_insert_race", KSTAT_DATA_UINT64 },
141 { "hash_table_count", KSTAT_DATA_UINT64 },
142 { "hash_mutex_count", KSTAT_DATA_UINT64 },
143 { "metadata_cache_count", KSTAT_DATA_UINT64 },
144 { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
145 { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
146 { "metadata_cache_overflow", KSTAT_DATA_UINT64 }
147 };
148
149 struct {
150 wmsum_t cache_count;
151 wmsum_t cache_total_evicts;
152 wmsum_t cache_levels[DN_MAX_LEVELS];
153 wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
154 wmsum_t hash_hits;
155 wmsum_t hash_misses;
156 wmsum_t hash_collisions;
157 wmsum_t hash_chains;
158 wmsum_t hash_insert_race;
159 wmsum_t metadata_cache_count;
160 wmsum_t metadata_cache_overflow;
161 } dbuf_sums;
162
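/*
 * The DBUF_STAT_* macros below update the wmsum counters in dbuf_sums.
 * DBUF_STAT_MAX instead maintains a monotonic maximum directly in the named
 * kstat value: it re-reads the current maximum and retries the
 * compare-and-swap until the stored value is at least v or the swap succeeds.
 */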
163 #define DBUF_STAT_INCR(stat, val) \
164 wmsum_add(&dbuf_sums.stat, val)
165 #define DBUF_STAT_DECR(stat, val) \
166 DBUF_STAT_INCR(stat, -(val))
167 #define DBUF_STAT_BUMP(stat) \
168 DBUF_STAT_INCR(stat, 1)
169 #define DBUF_STAT_BUMPDOWN(stat) \
170 DBUF_STAT_INCR(stat, -1)
171 #define DBUF_STAT_MAX(stat, v) { \
172 uint64_t _m; \
173 while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
174 (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
175 continue; \
176 }
177
178 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
179 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
180
181 /*
182 * Global data structures and functions for the dbuf cache.
183 */
184 static kmem_cache_t *dbuf_kmem_cache;
185 static taskq_t *dbu_evict_taskq;
186
187 static kthread_t *dbuf_cache_evict_thread;
188 static kmutex_t dbuf_evict_lock;
189 static kcondvar_t dbuf_evict_cv;
190 static boolean_t dbuf_evict_thread_exit;
191
192 /*
193 * There are two dbuf caches; each dbuf can only be in one of them at a time.
194 *
195 * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
196 * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
197 * that represent the metadata that describes filesystems/snapshots/
198 * bookmarks/properties/etc. We only evict from this cache when we export a
199 * pool, to short-circuit as much I/O as possible for all administrative
200 * commands that need the metadata. There is no eviction policy for this
201 * cache, because we try to only include types in it which would occupy a
202 * very small amount of space per object but create a large impact on the
203 * performance of these commands. Instead, after it reaches a maximum size
204 * (which should only happen on very small memory systems with a very large
205 * number of filesystem objects), we stop taking new dbufs into the
206 * metadata cache, instead putting them in the normal dbuf cache.
207 *
208 * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
209 * are not currently held but have been recently released. These dbufs
210 * are not eligible for arc eviction until they are aged out of the cache.
211 * Dbufs that are aged out of the cache will be immediately destroyed and
212 * become eligible for arc eviction.
213 *
214 * Dbufs are added to these caches once the last hold is released. If a dbuf is
215 * later accessed and still exists in the dbuf cache, then it will be removed
216 * from the cache and later re-added to the head of the cache.
217 *
218 * If a given dbuf meets the requirements for the metadata cache, it will go
219 * there, otherwise it will be considered for the generic LRU dbuf cache. The
220 * caches and the refcounts tracking their sizes are stored in an array indexed
221 * by those caches' matching enum values (from dbuf_cached_state_t).
222 */
223 typedef struct dbuf_cache {
224 multilist_t cache;
225 zfs_refcount_t size ____cacheline_aligned;
226 } dbuf_cache_t;
227 dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
228
229 /* Size limits for the caches */
230 static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
231 static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
232
233 /* Set the default sizes of the caches to log2 fraction of arc size */
234 static uint_t dbuf_cache_shift = 5;
235 static uint_t dbuf_metadata_cache_shift = 6;
236
237 /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
238 static uint_t dbuf_mutex_cache_shift = 0;
239
240 static unsigned long dbuf_cache_target_bytes(void);
241 static unsigned long dbuf_metadata_cache_target_bytes(void);
242
243 /*
244 * The LRU dbuf cache uses a three-stage eviction policy:
245 * - A low water marker designates when the dbuf eviction thread
246 * should stop evicting from the dbuf cache.
247 * - When we reach the maximum size (aka mid water mark), we
248 * signal the eviction thread to run.
249 * - The high water mark indicates when the eviction thread
250 * is unable to keep up with the incoming load and eviction must
251 * happen in the context of the calling thread.
252 *
253 * The dbuf cache:
254 * (max size)
255 * low water mid water hi water
256 * +----------------------------------------+----------+----------+
257 * | | | |
258 * | | | |
259 * | | | |
260 * | | | |
261 * +----------------------------------------+----------+----------+
262 * stop signal evict
263 * evicting eviction directly
264 * thread
265 *
266 * The high and low water marks indicate the operating range for the eviction
267 * thread. The low water mark is, by default, 90% of the total size of the
268 * cache and the high water mark is at 110% (both of these percentages can be
269 * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
270 * respectively). The eviction thread will try to ensure that the cache remains
271 * within this range by waking up every second and checking if the cache is
272 * above the low water mark. The thread can also be woken up by callers adding
273  * elements into the cache if the cache is larger than the mid water (i.e. max
274 * cache size). Once the eviction thread is woken up and eviction is required,
275 * it will continue evicting buffers until it's able to reduce the cache size
276 * to the low water mark. If the cache size continues to grow and hits the high
277 * water mark, then callers adding elements to the cache will begin to evict
278 * directly from the cache until the cache is no longer above the high water
279 * mark.
280 */
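/*
 * For example, assuming the default dbuf_cache_hiwater_pct and
 * dbuf_cache_lowater_pct of 10: with a target (max) size of 100 MiB, the low
 * water mark is 90 MiB and the high water mark is 110 MiB.  The eviction
 * thread is signalled once the cache exceeds 100 MiB and evicts down to
 * 90 MiB; callers evict inline only while the cache exceeds 110 MiB.
 */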
281
282 /*
283 * The percentage above and below the maximum cache size.
284 */
285 static uint_t dbuf_cache_hiwater_pct = 10;
286 static uint_t dbuf_cache_lowater_pct = 10;
287
288 static int
289 dbuf_cons(void *vdb, void *unused, int kmflag)
290 {
291 (void) unused, (void) kmflag;
292 dmu_buf_impl_t *db = vdb;
293 memset(db, 0, sizeof (dmu_buf_impl_t));
294
295 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
296 rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
297 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
298 multilist_link_init(&db->db_cache_link);
299 zfs_refcount_create(&db->db_holds);
300
301 return (0);
302 }
303
304 static void
305 dbuf_dest(void *vdb, void *unused)
306 {
307 (void) unused;
308 dmu_buf_impl_t *db = vdb;
309 mutex_destroy(&db->db_mtx);
310 rw_destroy(&db->db_rwlock);
311 cv_destroy(&db->db_changed);
312 ASSERT(!multilist_link_active(&db->db_cache_link));
313 zfs_refcount_destroy(&db->db_holds);
314 }
315
316 /*
317 * dbuf hash table routines
318 */
319 static dbuf_hash_table_t dbuf_hash_table;
320
321 /*
322 * We use Cityhash for this. It's fast, and has good hash properties without
323 * requiring any large static buffers.
324 */
325 static uint64_t
326 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
327 {
328 return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
329 }
330
331 #define DTRACE_SET_STATE(db, why) \
332 DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \
333 const char *, why)
334
335 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
336 ((dbuf)->db.db_object == (obj) && \
337 (dbuf)->db_objset == (os) && \
338 (dbuf)->db_level == (level) && \
339 (dbuf)->db_blkid == (blkid))
340
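/*
 * Look up a dbuf in the hash table.  If a matching dbuf is found and is not
 * in the DB_EVICTING state, it is returned with its db_mtx held; otherwise
 * NULL is returned and, if hash_out is non-NULL, the computed hash value is
 * stored there for the caller to reuse.
 */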
341 dmu_buf_impl_t *
342 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
343 uint64_t *hash_out)
344 {
345 dbuf_hash_table_t *h = &dbuf_hash_table;
346 uint64_t hv;
347 uint64_t idx;
348 dmu_buf_impl_t *db;
349
350 hv = dbuf_hash(os, obj, level, blkid);
351 idx = hv & h->hash_table_mask;
352
353 mutex_enter(DBUF_HASH_MUTEX(h, idx));
354 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
355 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
356 mutex_enter(&db->db_mtx);
357 if (db->db_state != DB_EVICTING) {
358 mutex_exit(DBUF_HASH_MUTEX(h, idx));
359 return (db);
360 }
361 mutex_exit(&db->db_mtx);
362 }
363 }
364 mutex_exit(DBUF_HASH_MUTEX(h, idx));
365 if (hash_out != NULL)
366 *hash_out = hv;
367 return (NULL);
368 }
369
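/*
 * Find the bonus dbuf for the given object, if one is currently
 * instantiated.  Returns it with db_mtx held, or NULL.
 */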
370 static dmu_buf_impl_t *
371 dbuf_find_bonus(objset_t *os, uint64_t object)
372 {
373 dnode_t *dn;
374 dmu_buf_impl_t *db = NULL;
375
376 if (dnode_hold(os, object, FTAG, &dn) == 0) {
377 rw_enter(&dn->dn_struct_rwlock, RW_READER);
378 if (dn->dn_bonus != NULL) {
379 db = dn->dn_bonus;
380 mutex_enter(&db->db_mtx);
381 }
382 rw_exit(&dn->dn_struct_rwlock);
383 dnode_rele(dn, FTAG);
384 }
385 return (db);
386 }
387
388 /*
389 * Insert an entry into the hash table. If there is already an element
390 * equal to elem in the hash table, then the already existing element
391 * will be returned and the new element will not be inserted.
392 * Otherwise returns NULL.
393 */
394 static dmu_buf_impl_t *
395 dbuf_hash_insert(dmu_buf_impl_t *db)
396 {
397 dbuf_hash_table_t *h = &dbuf_hash_table;
398 objset_t *os = db->db_objset;
399 uint64_t obj = db->db.db_object;
400 int level = db->db_level;
401 uint64_t blkid, idx;
402 dmu_buf_impl_t *dbf;
403 uint32_t i;
404
405 blkid = db->db_blkid;
406 ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
407 idx = db->db_hash & h->hash_table_mask;
408
409 mutex_enter(DBUF_HASH_MUTEX(h, idx));
410 for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
411 dbf = dbf->db_hash_next, i++) {
412 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
413 mutex_enter(&dbf->db_mtx);
414 if (dbf->db_state != DB_EVICTING) {
415 mutex_exit(DBUF_HASH_MUTEX(h, idx));
416 return (dbf);
417 }
418 mutex_exit(&dbf->db_mtx);
419 }
420 }
421
422 if (i > 0) {
423 DBUF_STAT_BUMP(hash_collisions);
424 if (i == 1)
425 DBUF_STAT_BUMP(hash_chains);
426
427 DBUF_STAT_MAX(hash_chain_max, i);
428 }
429
430 mutex_enter(&db->db_mtx);
431 db->db_hash_next = h->hash_table[idx];
432 h->hash_table[idx] = db;
433 mutex_exit(DBUF_HASH_MUTEX(h, idx));
434 uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
435 DBUF_STAT_MAX(hash_elements_max, he);
436
437 return (NULL);
438 }
439
440 /*
441 * This returns whether this dbuf should be stored in the metadata cache, which
442 * is based on whether it's from one of the dnode types that store data related
443 * to traversing dataset hierarchies.
444 */
445 static boolean_t
446 dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
447 {
448 DB_DNODE_ENTER(db);
449 dmu_object_type_t type = DB_DNODE(db)->dn_type;
450 DB_DNODE_EXIT(db);
451
452 /* Check if this dbuf is one of the types we care about */
453 if (DMU_OT_IS_METADATA_CACHED(type)) {
454 /* If we hit this, then we set something up wrong in dmu_ot */
455 ASSERT(DMU_OT_IS_METADATA(type));
456
457 /*
458 * Sanity check for small-memory systems: don't allocate too
459 * much memory for this purpose.
460 */
461 if (zfs_refcount_count(
462 &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
463 dbuf_metadata_cache_target_bytes()) {
464 DBUF_STAT_BUMP(metadata_cache_overflow);
465 return (B_FALSE);
466 }
467
468 return (B_TRUE);
469 }
470
471 return (B_FALSE);
472 }
473
474 /*
475 * Remove an entry from the hash table. It must be in the EVICTING state.
476 */
477 static void
478 dbuf_hash_remove(dmu_buf_impl_t *db)
479 {
480 dbuf_hash_table_t *h = &dbuf_hash_table;
481 uint64_t idx;
482 dmu_buf_impl_t *dbf, **dbp;
483
484 ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
485 db->db_blkid), ==, db->db_hash);
486 idx = db->db_hash & h->hash_table_mask;
487
488 /*
489 * We mustn't hold db_mtx to maintain lock ordering:
490 * DBUF_HASH_MUTEX > db_mtx.
491 */
492 ASSERT(zfs_refcount_is_zero(&db->db_holds));
493 ASSERT(db->db_state == DB_EVICTING);
494 ASSERT(!MUTEX_HELD(&db->db_mtx));
495
496 mutex_enter(DBUF_HASH_MUTEX(h, idx));
497 dbp = &h->hash_table[idx];
498 while ((dbf = *dbp) != db) {
499 dbp = &dbf->db_hash_next;
500 ASSERT(dbf != NULL);
501 }
502 *dbp = db->db_hash_next;
503 db->db_hash_next = NULL;
504 if (h->hash_table[idx] &&
505 h->hash_table[idx]->db_hash_next == NULL)
506 DBUF_STAT_BUMPDOWN(hash_chains);
507 mutex_exit(DBUF_HASH_MUTEX(h, idx));
508 atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
509 }
510
511 typedef enum {
512 DBVU_EVICTING,
513 DBVU_NOT_EVICTING
514 } dbvu_verify_type_t;
515
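/*
 * Debug-only consistency checks on the dmu_buf_user_t attached to a dbuf.
 * verify_type indicates whether the checks are performed in the context of
 * user eviction.
 */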
516 static void
517 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
518 {
519 #ifdef ZFS_DEBUG
520 int64_t holds;
521
522 if (db->db_user == NULL)
523 return;
524
525 /* Only data blocks support the attachment of user data. */
526 ASSERT(db->db_level == 0);
527
528 /* Clients must resolve a dbuf before attaching user data. */
529 ASSERT(db->db.db_data != NULL);
530 ASSERT3U(db->db_state, ==, DB_CACHED);
531
532 holds = zfs_refcount_count(&db->db_holds);
533 if (verify_type == DBVU_EVICTING) {
534 /*
535 * Immediate eviction occurs when holds == dirtycnt.
536 * For normal eviction buffers, holds is zero on
537 * eviction, except when dbuf_fix_old_data() calls
538 * dbuf_clear_data(). However, the hold count can grow
539 * during eviction even though db_mtx is held (see
540 * dmu_bonus_hold() for an example), so we can only
541 * test the generic invariant that holds >= dirtycnt.
542 */
543 ASSERT3U(holds, >=, db->db_dirtycnt);
544 } else {
545 if (db->db_user_immediate_evict == TRUE)
546 ASSERT3U(holds, >=, db->db_dirtycnt);
547 else
548 ASSERT3U(holds, >, 0);
549 }
550 #endif
551 }
552
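/*
 * Detach the attached dmu_buf_user_t (if any) from this dbuf, adjust the
 * cached-size accounting if the dbuf is in one of the dbuf caches, and
 * invoke the synchronous and/or asynchronous eviction callbacks.  The caller
 * must hold db_mtx.
 */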
553 static void
554 dbuf_evict_user(dmu_buf_impl_t *db)
555 {
556 dmu_buf_user_t *dbu = db->db_user;
557
558 ASSERT(MUTEX_HELD(&db->db_mtx));
559
560 if (dbu == NULL)
561 return;
562
563 dbuf_verify_user(db, DBVU_EVICTING);
564 db->db_user = NULL;
565
566 #ifdef ZFS_DEBUG
567 if (dbu->dbu_clear_on_evict_dbufp != NULL)
568 *dbu->dbu_clear_on_evict_dbufp = NULL;
569 #endif
570
571 if (db->db_caching_status != DB_NO_CACHE) {
572 /*
573 * This is a cached dbuf, so the size of the user data is
574 * included in its cached amount. We adjust it here because the
575 * user data has already been detached from the dbuf, and the
576 * sync functions are not supposed to touch it (the dbuf might
577 	 * not exist anymore by the time the sync functions run).
578 */
579 uint64_t size = dbu->dbu_size;
580 (void) zfs_refcount_remove_many(
581 &dbuf_caches[db->db_caching_status].size, size, db);
582 if (db->db_caching_status == DB_DBUF_CACHE)
583 DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
584 }
585
586 /*
587 * There are two eviction callbacks - one that we call synchronously
588 * and one that we invoke via a taskq. The async one is useful for
589 * avoiding lock order reversals and limiting stack depth.
590 *
591 * Note that if we have a sync callback but no async callback,
592 * it's likely that the sync callback will free the structure
593 * containing the dbu. In that case we need to take care to not
594 * dereference dbu after calling the sync evict func.
595 */
596 boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
597
598 if (dbu->dbu_evict_func_sync != NULL)
599 dbu->dbu_evict_func_sync(dbu);
600
601 if (has_async) {
602 taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
603 dbu, 0, &dbu->dbu_tqent);
604 }
605 }
606
607 boolean_t
608 dbuf_is_metadata(dmu_buf_impl_t *db)
609 {
610 /*
611 * Consider indirect blocks and spill blocks to be meta data.
612 */
613 if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
614 return (B_TRUE);
615 } else {
616 boolean_t is_metadata;
617
618 DB_DNODE_ENTER(db);
619 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
620 DB_DNODE_EXIT(db);
621
622 return (is_metadata);
623 }
624 }
625
626 /*
627 * We want to exclude buffers that are on a special allocation class from
628 * L2ARC.
629 */
630 boolean_t
631 dbuf_is_l2cacheable(dmu_buf_impl_t *db)
632 {
633 if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
634 (db->db_objset->os_secondary_cache ==
635 ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
636 if (l2arc_exclude_special == 0)
637 return (B_TRUE);
638
639 blkptr_t *bp = db->db_blkptr;
640 if (bp == NULL || BP_IS_HOLE(bp))
641 return (B_FALSE);
642 uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
643 vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
644 vdev_t *vd = NULL;
645
646 if (vdev < rvd->vdev_children)
647 vd = rvd->vdev_child[vdev];
648
649 if (vd == NULL)
650 return (B_TRUE);
651
652 if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
653 vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
654 return (B_TRUE);
655 }
656 return (B_FALSE);
657 }
658
659 static inline boolean_t
660 dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
661 {
662 if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
663 (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
664 (level > 0 ||
665 DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
666 if (l2arc_exclude_special == 0)
667 return (B_TRUE);
668
669 if (bp == NULL || BP_IS_HOLE(bp))
670 return (B_FALSE);
671 uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
672 vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
673 vdev_t *vd = NULL;
674
675 if (vdev < rvd->vdev_children)
676 vd = rvd->vdev_child[vdev];
677
678 if (vd == NULL)
679 return (B_TRUE);
680
681 if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
682 vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
683 return (B_TRUE);
684 }
685 return (B_FALSE);
686 }
687
688
689 /*
690 * This function *must* return indices evenly distributed between all
691 * sublists of the multilist. This is needed due to how the dbuf eviction
692 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
693 * distributed between all sublists and uses this assumption when
694 * deciding which sublist to evict from and how much to evict from it.
695 */
696 static unsigned int
697 dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
698 {
699 dmu_buf_impl_t *db = obj;
700
701 /*
702 * The assumption here, is the hash value for a given
703 	 * The assumption here is that the hash value for a given
704 	 * dmu_buf_impl_t will remain constant throughout its lifetime
705 	 * (i.e. its objset, object, level and blkid fields don't change).
706 * on insertion, as this index can be recalculated on removal.
707 *
708 * Also, the low order bits of the hash value are thought to be
709 * distributed evenly. Otherwise, in the case that the multilist
710 	 * has a power of two number of sublists, each sublist's usage
711 * would not be evenly distributed. In this context full 64bit
712 * division would be a waste of time, so limit it to 32 bits.
713 */
714 return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,
715 db->db_level, db->db_blkid) %
716 multilist_get_num_sublists(ml));
717 }
718
719 /*
720 * The target size of the dbuf cache can grow with the ARC target,
721 * unless limited by the tunable dbuf_cache_max_bytes.
722 */
723 static inline unsigned long
724 dbuf_cache_target_bytes(void)
725 {
726 return (MIN(dbuf_cache_max_bytes,
727 arc_target_bytes() >> dbuf_cache_shift));
728 }
729
730 /*
731 * The target size of the dbuf metadata cache can grow with the ARC target,
732 * unless limited by the tunable dbuf_metadata_cache_max_bytes.
733 */
734 static inline unsigned long
735 dbuf_metadata_cache_target_bytes(void)
736 {
737 return (MIN(dbuf_metadata_cache_max_bytes,
738 arc_target_bytes() >> dbuf_metadata_cache_shift));
739 }
740
741 static inline uint64_t
742 dbuf_cache_hiwater_bytes(void)
743 {
744 uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
745 return (dbuf_cache_target +
746 (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
747 }
748
749 static inline uint64_t
750 dbuf_cache_lowater_bytes(void)
751 {
752 uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
753 return (dbuf_cache_target -
754 (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
755 }
756
757 static inline boolean_t
758 dbuf_cache_above_lowater(void)
759 {
760 return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
761 dbuf_cache_lowater_bytes());
762 }
763
764 /*
765 * Evict the oldest eligible dbuf from the dbuf cache.
766 */
767 static void
768 dbuf_evict_one(void)
769 {
770 int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
771 multilist_sublist_t *mls = multilist_sublist_lock_idx(
772 &dbuf_caches[DB_DBUF_CACHE].cache, idx);
773
774 ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
775
776 dmu_buf_impl_t *db = multilist_sublist_tail(mls);
777 while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
778 db = multilist_sublist_prev(mls, db);
779 }
780
781 DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
782 multilist_sublist_t *, mls);
783
784 if (db != NULL) {
785 multilist_sublist_remove(mls, db);
786 multilist_sublist_unlock(mls);
787 uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
788 (void) zfs_refcount_remove_many(
789 &dbuf_caches[DB_DBUF_CACHE].size, size, db);
790 DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
791 DBUF_STAT_BUMPDOWN(cache_count);
792 DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
793 ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
794 db->db_caching_status = DB_NO_CACHE;
795 dbuf_destroy(db);
796 DBUF_STAT_BUMP(cache_total_evicts);
797 } else {
798 multilist_sublist_unlock(mls);
799 }
800 }
801
802 /*
803 * The dbuf evict thread is responsible for aging out dbufs from the
804  * cache. Once the cache has reached its maximum size, dbufs are removed
805 * and destroyed. The eviction thread will continue running until the size
806 * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
807 * out of the cache it is destroyed and becomes eligible for arc eviction.
808 */
809 static __attribute__((noreturn)) void
810 dbuf_evict_thread(void *unused)
811 {
812 (void) unused;
813 callb_cpr_t cpr;
814
815 CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
816
817 mutex_enter(&dbuf_evict_lock);
818 while (!dbuf_evict_thread_exit) {
819 while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
820 CALLB_CPR_SAFE_BEGIN(&cpr);
821 (void) cv_timedwait_idle_hires(&dbuf_evict_cv,
822 &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
823 CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
824 }
825 mutex_exit(&dbuf_evict_lock);
826
827 /*
828 * Keep evicting as long as we're above the low water mark
829 * for the cache. We do this without holding the locks to
830 * minimize lock contention.
831 */
832 while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
833 dbuf_evict_one();
834 }
835
836 mutex_enter(&dbuf_evict_lock);
837 }
838
839 dbuf_evict_thread_exit = B_FALSE;
840 cv_broadcast(&dbuf_evict_cv);
841 CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
842 thread_exit();
843 }
844
845 /*
846 * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
847 * If the dbuf cache is at its high water mark, then evict a dbuf from the
848 * dbuf cache using the caller's context.
849 */
850 static void
851 dbuf_evict_notify(uint64_t size)
852 {
853 /*
854 * We check if we should evict without holding the dbuf_evict_lock,
855 * because it's OK to occasionally make the wrong decision here,
856 * and grabbing the lock results in massive lock contention.
857 */
858 if (size > dbuf_cache_target_bytes()) {
859 if (size > dbuf_cache_hiwater_bytes())
860 dbuf_evict_one();
861 cv_signal(&dbuf_evict_cv);
862 }
863 }
864
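/*
 * kstat update callback for the "dbufstats" kstat: fold the wmsum counters
 * and cache refcounts into the named kstat values.  Writes are rejected.
 */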
865 static int
866 dbuf_kstat_update(kstat_t *ksp, int rw)
867 {
868 dbuf_stats_t *ds = ksp->ks_data;
869 dbuf_hash_table_t *h = &dbuf_hash_table;
870
871 if (rw == KSTAT_WRITE)
872 return (SET_ERROR(EACCES));
873
874 ds->cache_count.value.ui64 =
875 wmsum_value(&dbuf_sums.cache_count);
876 ds->cache_size_bytes.value.ui64 =
877 zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
878 ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
879 ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
880 ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
881 ds->cache_total_evicts.value.ui64 =
882 wmsum_value(&dbuf_sums.cache_total_evicts);
883 for (int i = 0; i < DN_MAX_LEVELS; i++) {
884 ds->cache_levels[i].value.ui64 =
885 wmsum_value(&dbuf_sums.cache_levels[i]);
886 ds->cache_levels_bytes[i].value.ui64 =
887 wmsum_value(&dbuf_sums.cache_levels_bytes[i]);
888 }
889 ds->hash_hits.value.ui64 =
890 wmsum_value(&dbuf_sums.hash_hits);
891 ds->hash_misses.value.ui64 =
892 wmsum_value(&dbuf_sums.hash_misses);
893 ds->hash_collisions.value.ui64 =
894 wmsum_value(&dbuf_sums.hash_collisions);
895 ds->hash_chains.value.ui64 =
896 wmsum_value(&dbuf_sums.hash_chains);
897 ds->hash_insert_race.value.ui64 =
898 wmsum_value(&dbuf_sums.hash_insert_race);
899 ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
900 ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
901 ds->metadata_cache_count.value.ui64 =
902 wmsum_value(&dbuf_sums.metadata_cache_count);
903 ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
904 &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
905 ds->metadata_cache_overflow.value.ui64 =
906 wmsum_value(&dbuf_sums.metadata_cache_overflow);
907 return (0);
908 }
909
910 void
911 dbuf_init(void)
912 {
913 uint64_t hmsize, hsize = 1ULL << 16;
914 dbuf_hash_table_t *h = &dbuf_hash_table;
915
916 /*
917 * The hash table is big enough to fill one eighth of physical memory
918 * with an average block size of zfs_arc_average_blocksize (default 8K).
919 * By default, the table will take up
920 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
921 */
922 while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
923 hsize <<= 1;
924
925 h->hash_table = NULL;
926 while (h->hash_table == NULL) {
927 h->hash_table_mask = hsize - 1;
928
929 h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
930 if (h->hash_table == NULL)
931 hsize >>= 1;
932
933 ASSERT3U(hsize, >=, 1ULL << 10);
934 }
935
936 /*
937 * The hash table buckets are protected by an array of mutexes where
938  * each mutex is responsible for protecting 128 buckets. A minimum
939 * array size of 8192 is targeted to avoid contention.
940 */
941 if (dbuf_mutex_cache_shift == 0)
942 hmsize = MAX(hsize >> 7, 1ULL << 13);
943 else
944 hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
945
946 h->hash_mutexes = NULL;
947 while (h->hash_mutexes == NULL) {
948 h->hash_mutex_mask = hmsize - 1;
949
950 h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
951 KM_SLEEP);
952 if (h->hash_mutexes == NULL)
953 hmsize >>= 1;
954 }
955
956 dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
957 sizeof (dmu_buf_impl_t),
958 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
959
960 for (int i = 0; i < hmsize; i++)
961 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
962
963 dbuf_stats_init(h);
964
965 /*
966 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
967 * configuration is not required.
968 */
969 dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
970
971 for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
972 multilist_create(&dbuf_caches[dcs].cache,
973 sizeof (dmu_buf_impl_t),
974 offsetof(dmu_buf_impl_t, db_cache_link),
975 dbuf_cache_multilist_index_func);
976 zfs_refcount_create(&dbuf_caches[dcs].size);
977 }
978
979 dbuf_evict_thread_exit = B_FALSE;
980 mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
981 cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
982 dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
983 NULL, 0, &p0, TS_RUN, minclsyspri);
984
985 wmsum_init(&dbuf_sums.cache_count, 0);
986 wmsum_init(&dbuf_sums.cache_total_evicts, 0);
987 for (int i = 0; i < DN_MAX_LEVELS; i++) {
988 wmsum_init(&dbuf_sums.cache_levels[i], 0);
989 wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
990 }
991 wmsum_init(&dbuf_sums.hash_hits, 0);
992 wmsum_init(&dbuf_sums.hash_misses, 0);
993 wmsum_init(&dbuf_sums.hash_collisions, 0);
994 wmsum_init(&dbuf_sums.hash_chains, 0);
995 wmsum_init(&dbuf_sums.hash_insert_race, 0);
996 wmsum_init(&dbuf_sums.metadata_cache_count, 0);
997 wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);
998
999 dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
1000 KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
1001 KSTAT_FLAG_VIRTUAL);
1002 if (dbuf_ksp != NULL) {
1003 for (int i = 0; i < DN_MAX_LEVELS; i++) {
1004 snprintf(dbuf_stats.cache_levels[i].name,
1005 KSTAT_STRLEN, "cache_level_%d", i);
1006 dbuf_stats.cache_levels[i].data_type =
1007 KSTAT_DATA_UINT64;
1008 snprintf(dbuf_stats.cache_levels_bytes[i].name,
1009 KSTAT_STRLEN, "cache_level_%d_bytes", i);
1010 dbuf_stats.cache_levels_bytes[i].data_type =
1011 KSTAT_DATA_UINT64;
1012 }
1013 dbuf_ksp->ks_data = &dbuf_stats;
1014 dbuf_ksp->ks_update = dbuf_kstat_update;
1015 kstat_install(dbuf_ksp);
1016 }
1017 }
1018
1019 void
1020 dbuf_fini(void)
1021 {
1022 dbuf_hash_table_t *h = &dbuf_hash_table;
1023
1024 dbuf_stats_destroy();
1025
1026 for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
1027 mutex_destroy(&h->hash_mutexes[i]);
1028
1029 vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
1030 vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
1031 sizeof (kmutex_t));
1032
1033 kmem_cache_destroy(dbuf_kmem_cache);
1034 taskq_destroy(dbu_evict_taskq);
1035
1036 mutex_enter(&dbuf_evict_lock);
1037 dbuf_evict_thread_exit = B_TRUE;
1038 while (dbuf_evict_thread_exit) {
1039 cv_signal(&dbuf_evict_cv);
1040 cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
1041 }
1042 mutex_exit(&dbuf_evict_lock);
1043
1044 mutex_destroy(&dbuf_evict_lock);
1045 cv_destroy(&dbuf_evict_cv);
1046
1047 for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
1048 zfs_refcount_destroy(&dbuf_caches[dcs].size);
1049 multilist_destroy(&dbuf_caches[dcs].cache);
1050 }
1051
1052 if (dbuf_ksp != NULL) {
1053 kstat_delete(dbuf_ksp);
1054 dbuf_ksp = NULL;
1055 }
1056
1057 wmsum_fini(&dbuf_sums.cache_count);
1058 wmsum_fini(&dbuf_sums.cache_total_evicts);
1059 for (int i = 0; i < DN_MAX_LEVELS; i++) {
1060 wmsum_fini(&dbuf_sums.cache_levels[i]);
1061 wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
1062 }
1063 wmsum_fini(&dbuf_sums.hash_hits);
1064 wmsum_fini(&dbuf_sums.hash_misses);
1065 wmsum_fini(&dbuf_sums.hash_collisions);
1066 wmsum_fini(&dbuf_sums.hash_chains);
1067 wmsum_fini(&dbuf_sums.hash_insert_race);
1068 wmsum_fini(&dbuf_sums.metadata_cache_count);
1069 wmsum_fini(&dbuf_sums.metadata_cache_overflow);
1070 }
1071
1072 /*
1073 * Other stuff.
1074 */
1075
1076 #ifdef ZFS_DEBUG
1077 static void
1078 dbuf_verify(dmu_buf_impl_t *db)
1079 {
1080 dnode_t *dn;
1081 dbuf_dirty_record_t *dr;
1082 uint32_t txg_prev;
1083
1084 ASSERT(MUTEX_HELD(&db->db_mtx));
1085
1086 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
1087 return;
1088
1089 ASSERT(db->db_objset != NULL);
1090 DB_DNODE_ENTER(db);
1091 dn = DB_DNODE(db);
1092 if (dn == NULL) {
1093 ASSERT(db->db_parent == NULL);
1094 ASSERT(db->db_blkptr == NULL);
1095 } else {
1096 ASSERT3U(db->db.db_object, ==, dn->dn_object);
1097 ASSERT3P(db->db_objset, ==, dn->dn_objset);
1098 ASSERT3U(db->db_level, <, dn->dn_nlevels);
1099 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
1100 db->db_blkid == DMU_SPILL_BLKID ||
1101 !avl_is_empty(&dn->dn_dbufs));
1102 }
1103 if (db->db_blkid == DMU_BONUS_BLKID) {
1104 ASSERT(dn != NULL);
1105 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1106 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
1107 } else if (db->db_blkid == DMU_SPILL_BLKID) {
1108 ASSERT(dn != NULL);
1109 ASSERT0(db->db.db_offset);
1110 } else {
1111 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
1112 }
1113
1114 if ((dr = list_head(&db->db_dirty_records)) != NULL) {
1115 ASSERT(dr->dr_dbuf == db);
1116 txg_prev = dr->dr_txg;
1117 for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
1118 dr = list_next(&db->db_dirty_records, dr)) {
1119 ASSERT(dr->dr_dbuf == db);
1120 ASSERT(txg_prev > dr->dr_txg);
1121 txg_prev = dr->dr_txg;
1122 }
1123 }
1124
1125 /*
1126 * We can't assert that db_size matches dn_datablksz because it
1127 * can be momentarily different when another thread is doing
1128 * dnode_set_blksz().
1129 */
1130 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
1131 dr = db->db_data_pending;
1132 /*
1133 * It should only be modified in syncing context, so
1134 * make sure we only have one copy of the data.
1135 */
1136 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
1137 }
1138
1139 /* verify db->db_blkptr */
1140 if (db->db_blkptr) {
1141 if (db->db_parent == dn->dn_dbuf) {
1142 /* db is pointed to by the dnode */
1143 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
1144 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
1145 ASSERT(db->db_parent == NULL);
1146 else
1147 ASSERT(db->db_parent != NULL);
1148 if (db->db_blkid != DMU_SPILL_BLKID)
1149 ASSERT3P(db->db_blkptr, ==,
1150 &dn->dn_phys->dn_blkptr[db->db_blkid]);
1151 } else {
1152 /* db is pointed to by an indirect block */
1153 int epb __maybe_unused = db->db_parent->db.db_size >>
1154 SPA_BLKPTRSHIFT;
1155 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
1156 ASSERT3U(db->db_parent->db.db_object, ==,
1157 db->db.db_object);
1158 /*
1159 * dnode_grow_indblksz() can make this fail if we don't
1160 * have the parent's rwlock. XXX indblksz no longer
1161 * grows. safe to do this now?
1162 */
1163 if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
1164 ASSERT3P(db->db_blkptr, ==,
1165 ((blkptr_t *)db->db_parent->db.db_data +
1166 db->db_blkid % epb));
1167 }
1168 }
1169 }
1170 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
1171 (db->db_buf == NULL || db->db_buf->b_data) &&
1172 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
1173 db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
1174 /*
1175 * If the blkptr isn't set but they have nonzero data,
1176 * it had better be dirty, otherwise we'll lose that
1177 * data when we evict this buffer.
1178 *
1179 * There is an exception to this rule for indirect blocks; in
1180 * this case, if the indirect block is a hole, we fill in a few
1181 * fields on each of the child blocks (importantly, birth time)
1182 * to prevent hole birth times from being lost when you
1183 * partially fill in a hole.
1184 */
1185 if (db->db_dirtycnt == 0) {
1186 if (db->db_level == 0) {
1187 uint64_t *buf = db->db.db_data;
1188 int i;
1189
1190 for (i = 0; i < db->db.db_size >> 3; i++) {
1191 ASSERT(buf[i] == 0);
1192 }
1193 } else {
1194 blkptr_t *bps = db->db.db_data;
1195 ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
1196 db->db.db_size);
1197 /*
1198 * We want to verify that all the blkptrs in the
1199 * indirect block are holes, but we may have
1200 * automatically set up a few fields for them.
1201 * We iterate through each blkptr and verify
1202 * they only have those fields set.
1203 */
1204 for (int i = 0;
1205 i < db->db.db_size / sizeof (blkptr_t);
1206 i++) {
1207 blkptr_t *bp = &bps[i];
1208 ASSERT(ZIO_CHECKSUM_IS_ZERO(
1209 &bp->blk_cksum));
1210 ASSERT(
1211 DVA_IS_EMPTY(&bp->blk_dva[0]) &&
1212 DVA_IS_EMPTY(&bp->blk_dva[1]) &&
1213 DVA_IS_EMPTY(&bp->blk_dva[2]));
1214 ASSERT0(bp->blk_fill);
1215 ASSERT0(bp->blk_pad[0]);
1216 ASSERT0(bp->blk_pad[1]);
1217 ASSERT(!BP_IS_EMBEDDED(bp));
1218 ASSERT(BP_IS_HOLE(bp));
1219 ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
1220 }
1221 }
1222 }
1223 }
1224 DB_DNODE_EXIT(db);
1225 }
1226 #endif
1227
1228 static void
1229 dbuf_clear_data(dmu_buf_impl_t *db)
1230 {
1231 ASSERT(MUTEX_HELD(&db->db_mtx));
1232 dbuf_evict_user(db);
1233 ASSERT3P(db->db_buf, ==, NULL);
1234 db->db.db_data = NULL;
1235 if (db->db_state != DB_NOFILL) {
1236 db->db_state = DB_UNCACHED;
1237 DTRACE_SET_STATE(db, "clear data");
1238 }
1239 }
1240
1241 static void
1242 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
1243 {
1244 ASSERT(MUTEX_HELD(&db->db_mtx));
1245 ASSERT(buf != NULL);
1246
1247 db->db_buf = buf;
1248 ASSERT(buf->b_data != NULL);
1249 db->db.db_data = buf->b_data;
1250 }
1251
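/*
 * Allocate an ARC buffer of the appropriate type and size to back this dbuf.
 */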
1252 static arc_buf_t *
1253 dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
1254 {
1255 spa_t *spa = db->db_objset->os_spa;
1256
1257 return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));
1258 }
1259
1260 /*
1261 * Loan out an arc_buf for read. Return the loaned arc_buf.
1262 */
1263 arc_buf_t *
1264 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
1265 {
1266 arc_buf_t *abuf;
1267
1268 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1269 mutex_enter(&db->db_mtx);
1270 if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
1271 int blksz = db->db.db_size;
1272 spa_t *spa = db->db_objset->os_spa;
1273
1274 mutex_exit(&db->db_mtx);
1275 abuf = arc_loan_buf(spa, B_FALSE, blksz);
1276 memcpy(abuf->b_data, db->db.db_data, blksz);
1277 } else {
1278 abuf = db->db_buf;
1279 arc_loan_inuse_buf(abuf, db);
1280 db->db_buf = NULL;
1281 dbuf_clear_data(db);
1282 mutex_exit(&db->db_mtx);
1283 }
1284 return (abuf);
1285 }
1286
1287 /*
1288 * Calculate which level n block references the data at the level 0 offset
1289 * provided.
1290 */
1291 uint64_t
1292 dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
1293 {
1294 if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
1295 /*
1296 * The level n blkid is equal to the level 0 blkid divided by
1297 * the number of level 0s in a level n block.
1298 *
1299 * The level 0 blkid is offset >> datablkshift =
1300 * offset / 2^datablkshift.
1301 *
1302 * The number of level 0s in a level n is the number of block
1303 * pointers in an indirect block, raised to the power of level.
1304 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
1305 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
1306 *
1307 * Thus, the level n blkid is: offset /
1308 * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
1309 * = offset / 2^(datablkshift + level *
1310 * (indblkshift - SPA_BLKPTRSHIFT))
1311 * = offset >> (datablkshift + level *
1312 * (indblkshift - SPA_BLKPTRSHIFT))
1313 */
1314
1315 const unsigned exp = dn->dn_datablkshift +
1316 level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
1317
1318 if (exp >= 8 * sizeof (offset)) {
1319 /* This only happens on the highest indirection level */
1320 ASSERT3U(level, ==, dn->dn_nlevels - 1);
1321 return (0);
1322 }
1323
1324 ASSERT3U(exp, <, 8 * sizeof (offset));
1325
1326 return (offset >> exp);
1327 } else {
1328 ASSERT3U(offset, <, dn->dn_datablksz);
1329 return (0);
1330 }
1331 }
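/*
 * Illustrative example of the calculation above: with dn_datablkshift = 17
 * (128K data blocks), dn_indblkshift = 17 (1024 block pointers per indirect
 * block) and level = 1, exp = 17 + 1 * (17 - 7) = 27, so the level 1 blkid
 * covering offset 1 GiB is (1ULL << 30) >> 27 = 8.
 */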
1332
1333 /*
1334 * This function is used to lock the parent of the provided dbuf. This should be
1335 * used when modifying or reading db_blkptr.
1336 */
1337 db_lock_type_t
1338 dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
1339 {
1340 enum db_lock_type ret = DLT_NONE;
1341 if (db->db_parent != NULL) {
1342 rw_enter(&db->db_parent->db_rwlock, rw);
1343 ret = DLT_PARENT;
1344 } else if (dmu_objset_ds(db->db_objset) != NULL) {
1345 rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
1346 tag);
1347 ret = DLT_OBJSET;
1348 }
1349 /*
1350 * We only return a DLT_NONE lock when it's the top-most indirect block
1351 * of the meta-dnode of the MOS.
1352 */
1353 return (ret);
1354 }
1355
1356 /*
1357 * We need to pass the lock type in because it's possible that the block will
1358 * move from being the topmost indirect block in a dnode (and thus, have no
1359 * parent) to not the top-most via an indirection increase. This would cause a
1360 * panic if we didn't pass the lock type in.
1361 */
1362 void
1363 dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
1364 {
1365 if (type == DLT_PARENT)
1366 rw_exit(&db->db_parent->db_rwlock);
1367 else if (type == DLT_OBJSET)
1368 rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
1369 }
1370
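/*
 * Completion callback for dbuf reads issued via arc_read().  On success the
 * returned ARC buffer becomes the dbuf's data and the dbuf transitions to
 * DB_CACHED; on I/O error it is left DB_UNCACHED; if the block was freed
 * while the read was in flight the buffer is zeroed instead.  Wakes any
 * waiters and drops the hold taken when the read was issued.
 */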
1371 static void
1372 dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
1373 arc_buf_t *buf, void *vdb)
1374 {
1375 (void) zb, (void) bp;
1376 dmu_buf_impl_t *db = vdb;
1377
1378 mutex_enter(&db->db_mtx);
1379 ASSERT3U(db->db_state, ==, DB_READ);
1380 /*
1381 * All reads are synchronous, so we must have a hold on the dbuf
1382 */
1383 ASSERT(zfs_refcount_count(&db->db_holds) > 0);
1384 ASSERT(db->db_buf == NULL);
1385 ASSERT(db->db.db_data == NULL);
1386 if (buf == NULL) {
1387 /* i/o error */
1388 ASSERT(zio == NULL || zio->io_error != 0);
1389 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1390 ASSERT3P(db->db_buf, ==, NULL);
1391 db->db_state = DB_UNCACHED;
1392 DTRACE_SET_STATE(db, "i/o error");
1393 } else if (db->db_level == 0 && db->db_freed_in_flight) {
1394 /* freed in flight */
1395 ASSERT(zio == NULL || zio->io_error == 0);
1396 arc_release(buf, db);
1397 memset(buf->b_data, 0, db->db.db_size);
1398 arc_buf_freeze(buf);
1399 db->db_freed_in_flight = FALSE;
1400 dbuf_set_data(db, buf);
1401 db->db_state = DB_CACHED;
1402 DTRACE_SET_STATE(db, "freed in flight");
1403 } else {
1404 /* success */
1405 ASSERT(zio == NULL || zio->io_error == 0);
1406 dbuf_set_data(db, buf);
1407 db->db_state = DB_CACHED;
1408 DTRACE_SET_STATE(db, "successful read");
1409 }
1410 cv_broadcast(&db->db_changed);
1411 dbuf_rele_and_unlock(db, NULL, B_FALSE);
1412 }
1413
1414 /*
1415 * Shortcut for performing reads on bonus dbufs. Returns
1416 * an error if we fail to verify the dnode associated with
1417 * a decrypted block. Otherwise success.
1418 */
1419 static int
1420 dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
1421 {
1422 int bonuslen, max_bonuslen;
1423
1424 bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
1425 max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
1426 ASSERT(MUTEX_HELD(&db->db_mtx));
1427 ASSERT(DB_DNODE_HELD(db));
1428 ASSERT3U(bonuslen, <=, db->db.db_size);
1429 db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
1430 arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
1431 if (bonuslen < max_bonuslen)
1432 memset(db->db.db_data, 0, max_bonuslen);
1433 if (bonuslen)
1434 memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen);
1435 db->db_state = DB_CACHED;
1436 DTRACE_SET_STATE(db, "bonus buffer filled");
1437 return (0);
1438 }
1439
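/*
 * When a read of an indirect block is satisfied as a hole, seed each child
 * block pointer with the type, level, logical size and birth time implied by
 * the parent (hole) block pointer, so that hole birth times are preserved.
 */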
1440 static void
1441 dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
1442 {
1443 blkptr_t *bps = db->db.db_data;
1444 uint32_t indbs = 1ULL << dn->dn_indblkshift;
1445 int n_bps = indbs >> SPA_BLKPTRSHIFT;
1446
1447 for (int i = 0; i < n_bps; i++) {
1448 blkptr_t *bp = &bps[i];
1449
1450 ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
1451 BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
1452 dn->dn_datablksz : BP_GET_LSIZE(dbbp));
1453 BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
1454 BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
1455 BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
1456 }
1457 }
1458
1459 /*
1460 * Handle reads on dbufs that are holes, if necessary. This function
1461 * requires that the dbuf's mutex is held. Returns success (0) if action
1462 * was taken, ENOENT if no action was taken.
1463 */
1464 static int
1465 dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
1466 {
1467 ASSERT(MUTEX_HELD(&db->db_mtx));
1468
1469 int is_hole = bp == NULL || BP_IS_HOLE(bp);
1470 /*
1471 * For level 0 blocks only, if the above check fails:
1472 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
1473 * processes the delete record and clears the bp while we are waiting
1474 * for the dn_mtx (resulting in a "no" from block_freed).
1475 */
1476 if (!is_hole && db->db_level == 0)
1477 is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
1478
1479 if (is_hole) {
1480 dbuf_set_data(db, dbuf_alloc_arcbuf(db));
1481 memset(db->db.db_data, 0, db->db.db_size);
1482
1483 if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
1484 BP_GET_LOGICAL_BIRTH(bp) != 0) {
1485 dbuf_handle_indirect_hole(db, dn, bp);
1486 }
1487 db->db_state = DB_CACHED;
1488 DTRACE_SET_STATE(db, "hole read satisfied");
1489 return (0);
1490 }
1491 return (ENOENT);
1492 }
1493
1494 /*
1495 * This function ensures that, when doing a decrypting read of a block,
1496 * we make sure we have decrypted the dnode associated with it. We must do
1497 * this so that we ensure we are fully authenticating the checksum-of-MACs
1498 * tree from the root of the objset down to this block. Indirect blocks are
1499 * always verified against their secure checksum-of-MACs assuming that the
1500 * dnode containing them is correct. Now that we are doing a decrypting read,
1501 * we can be sure that the key is loaded and verify that assumption. This is
1502 * especially important considering that we always read encrypted dnode
1503 * blocks as raw data (without verifying their MACs) to start, and
1504 * decrypt / authenticate them when we need to read an encrypted bonus buffer.
1505 */
1506 static int
1507 dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
1508 {
1509 objset_t *os = db->db_objset;
1510 dmu_buf_impl_t *dndb;
1511 arc_buf_t *dnbuf;
1512 zbookmark_phys_t zb;
1513 int err;
1514
1515 if ((flags & DB_RF_NO_DECRYPT) != 0 ||
1516 !os->os_encrypted || os->os_raw_receive ||
1517 (dndb = dn->dn_dbuf) == NULL)
1518 return (0);
1519
1520 dnbuf = dndb->db_buf;
1521 if (!arc_is_encrypted(dnbuf))
1522 return (0);
1523
1524 mutex_enter(&dndb->db_mtx);
1525
1526 /*
1527 	 * Since the dnode buffer is modified by the sync process, there can
1528 	 * be only one copy of it, which means we cannot modify (decrypt) it
1529 	 * while it is being written.  I don't see how this may happen now,
1530 	 * since encrypted dnode writes by receive should be completed before
1531 	 * any plain-text reads due to txg wait, but better be safe than sorry.
1532 */
1533 while (1) {
1534 if (!arc_is_encrypted(dnbuf)) {
1535 mutex_exit(&dndb->db_mtx);
1536 return (0);
1537 }
1538 dbuf_dirty_record_t *dr = dndb->db_data_pending;
1539 if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
1540 break;
1541 cv_wait(&dndb->db_changed, &dndb->db_mtx);
1542 };
1543
1544 SET_BOOKMARK(&zb, dmu_objset_id(os),
1545 DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
1546 err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
1547
1548 /*
1549 * An error code of EACCES tells us that the key is still not
1550 * available. This is ok if we are only reading authenticated
1551 * (and therefore non-encrypted) blocks.
1552 */
1553 if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
1554 !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
1555 (db->db_blkid == DMU_BONUS_BLKID &&
1556 !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
1557 err = 0;
1558
1559 mutex_exit(&dndb->db_mtx);
1560
1561 return (err);
1562 }
1563
1564 /*
1565 * Drops db_mtx and the parent lock specified by dblt and tag before
1566 * returning.
1567 */
1568 static int
1569 dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
1570 db_lock_type_t dblt, const void *tag)
1571 {
1572 zbookmark_phys_t zb;
1573 uint32_t aflags = ARC_FLAG_NOWAIT;
1574 int err, zio_flags;
1575 blkptr_t bp, *bpp = NULL;
1576
1577 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1578 ASSERT(MUTEX_HELD(&db->db_mtx));
1579 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1580 ASSERT(db->db_buf == NULL);
1581 ASSERT(db->db_parent == NULL ||
1582 RW_LOCK_HELD(&db->db_parent->db_rwlock));
1583
1584 if (db->db_blkid == DMU_BONUS_BLKID) {
1585 err = dbuf_read_bonus(db, dn);
1586 goto early_unlock;
1587 }
1588
1589 /*
1590 * If we have a pending block clone, we don't want to read the
1591 * underlying block, but the content of the block being cloned,
1592 	 * pointed to by the dirty record, so we have the most recent data.
1593 	 * If there is no dirty record, then we hit a race in a sync
1594 	 * process when the dirty record is already removed, while the
1595 	 * dbuf is not yet destroyed. Such a case is equivalent to uncached.
1596 */
1597 if (db->db_state == DB_NOFILL) {
1598 dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
1599 if (dr != NULL) {
1600 if (!dr->dt.dl.dr_brtwrite) {
1601 err = EIO;
1602 goto early_unlock;
1603 }
1604 bp = dr->dt.dl.dr_overridden_by;
1605 bpp = &bp;
1606 }
1607 }
1608
1609 if (bpp == NULL && db->db_blkptr != NULL) {
1610 bp = *db->db_blkptr;
1611 bpp = &bp;
1612 }
1613
1614 err = dbuf_read_hole(db, dn, bpp);
1615 if (err == 0)
1616 goto early_unlock;
1617
1618 ASSERT(bpp != NULL);
1619
1620 /*
1621 * Any attempt to read a redacted block should result in an error. This
1622 * will never happen under normal conditions, but can be useful for
1623 * debugging purposes.
1624 */
1625 if (BP_IS_REDACTED(bpp)) {
1626 ASSERT(dsl_dataset_feature_is_active(
1627 db->db_objset->os_dsl_dataset,
1628 SPA_FEATURE_REDACTED_DATASETS));
1629 err = SET_ERROR(EIO);
1630 goto early_unlock;
1631 }
1632
1633 SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
1634 db->db.db_object, db->db_level, db->db_blkid);
1635
1636 /*
1637 * All bps of an encrypted os should have the encryption bit set.
1638 * If this is not true it indicates tampering and we report an error.
1639 */
1640 if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
1641 spa_log_error(db->db_objset->os_spa, &zb,
1642 BP_GET_LOGICAL_BIRTH(bpp));
1643 err = SET_ERROR(EIO);
1644 goto early_unlock;
1645 }
1646
1647 db->db_state = DB_READ;
1648 DTRACE_SET_STATE(db, "read issued");
1649 mutex_exit(&db->db_mtx);
1650
1651 if (!DBUF_IS_CACHEABLE(db))
1652 aflags |= ARC_FLAG_UNCACHED;
1653 else if (dbuf_is_l2cacheable(db))
1654 aflags |= ARC_FLAG_L2CACHE;
1655
1656 dbuf_add_ref(db, NULL);
1657
1658 zio_flags = (flags & DB_RF_CANFAIL) ?
1659 ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
1660
1661 if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
1662 zio_flags |= ZIO_FLAG_RAW;
1663 /*
1664 * The zio layer will copy the provided blkptr later, but we have our
1665 * own copy so that we can release the parent's rwlock. We have to
1666 * do that so that if dbuf_read_done is called synchronously (on
1667 * an l1 cache hit) we don't acquire the db_mtx while holding the
1668 * parent's rwlock, which would be a lock ordering violation.
1669 */
1670 dmu_buf_unlock_parent(db, dblt, tag);
1671 return (arc_read(zio, db->db_objset->os_spa, bpp,
1672 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
1673 &aflags, &zb));
1674
1675 early_unlock:
1676 mutex_exit(&db->db_mtx);
1677 dmu_buf_unlock_parent(db, dblt, tag);
1678 return (err);
1679 }
1680
1681 /*
1682 * This is our just-in-time copy function. It makes a copy of buffers that
1683 * have been modified in a previous transaction group before we access them in
1684 * the current active group.
1685 *
1686 * This function is used in three places: when we are dirtying a buffer for the
1687 * first time in a txg, when we are freeing a range in a dnode that includes
1688 * this buffer, and when we are accessing a buffer which was received compressed
1689 * and later referenced in a WRITE_BYREF record.
1690 *
1691 * Note that when we are called from dbuf_free_range() we do not put a hold on
1692 * the buffer, we just traverse the active dbuf list for the dnode.
1693 */
1694 static void
1695 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
1696 {
1697 dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
1698
1699 ASSERT(MUTEX_HELD(&db->db_mtx));
1700 ASSERT(db->db.db_data != NULL);
1701 ASSERT(db->db_level == 0);
1702 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
1703
1704 if (dr == NULL ||
1705 (dr->dt.dl.dr_data !=
1706 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
1707 return;
1708
1709 /*
1710 * If the last dirty record for this dbuf has not yet synced
1711 * and it's referencing the dbuf data, either:
1712 *	reset the reference to point to a new copy,
1713 * or (if there are no active holders)
1714 * just null out the current db_data pointer.
1715 */
1716 ASSERT3U(dr->dr_txg, >=, txg - 2);
1717 if (db->db_blkid == DMU_BONUS_BLKID) {
1718 dnode_t *dn = DB_DNODE(db);
1719 int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
1720 dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
1721 arc_space_consume(bonuslen, ARC_SPACE_BONUS);
1722 memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
1723 } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
1724 dnode_t *dn = DB_DNODE(db);
1725 int size = arc_buf_size(db->db_buf);
1726 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1727 spa_t *spa = db->db_objset->os_spa;
1728 enum zio_compress compress_type =
1729 arc_get_compression(db->db_buf);
1730 uint8_t complevel = arc_get_complevel(db->db_buf);
1731
1732 if (arc_is_encrypted(db->db_buf)) {
1733 boolean_t byteorder;
1734 uint8_t salt[ZIO_DATA_SALT_LEN];
1735 uint8_t iv[ZIO_DATA_IV_LEN];
1736 uint8_t mac[ZIO_DATA_MAC_LEN];
1737
1738 arc_get_raw_params(db->db_buf, &byteorder, salt,
1739 iv, mac);
1740 dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
1741 dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
1742 mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
1743 compress_type, complevel);
1744 } else if (compress_type != ZIO_COMPRESS_OFF) {
1745 ASSERT3U(type, ==, ARC_BUFC_DATA);
1746 dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
1747 size, arc_buf_lsize(db->db_buf), compress_type,
1748 complevel);
1749 } else {
1750 dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
1751 }
1752 memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
1753 } else {
1754 db->db_buf = NULL;
1755 dbuf_clear_data(db);
1756 }
1757 }
1758
1759 int
1760 dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
1761 {
1762 dnode_t *dn;
1763 boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
1764 int err;
1765
1766 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1767
1768 DB_DNODE_ENTER(db);
1769 dn = DB_DNODE(db);
1770
1771 /*
1772 * Ensure that this block's dnode has been decrypted if the caller
1773 * has requested decrypted data.
1774 */
1775 err = dbuf_read_verify_dnode_crypt(db, dn, flags);
1776 if (err != 0)
1777 goto done;
1778
1779 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1780 (flags & DB_RF_NOPREFETCH) == 0;
1781
1782 mutex_enter(&db->db_mtx);
1783 if (flags & DB_RF_PARTIAL_FIRST)
1784 db->db_partial_read = B_TRUE;
1785 else if (!(flags & DB_RF_PARTIAL_MORE))
1786 db->db_partial_read = B_FALSE;
1787 miss = (db->db_state != DB_CACHED);
1788
1789 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
1790 /*
1791 * Another reader came in while the dbuf was in flight between
1792 * UNCACHED and CACHED. Either a writer will finish filling
1793 * the buffer, sending the dbuf to CACHED, or the first reader's
1794 * request will reach the read_done callback and send the dbuf
1795 * to CACHED. Otherwise, a failure occurred and the dbuf will
1796 * be sent to UNCACHED.
1797 */
1798 if (flags & DB_RF_NEVERWAIT) {
1799 mutex_exit(&db->db_mtx);
1800 DB_DNODE_EXIT(db);
1801 goto done;
1802 }
1803 do {
1804 ASSERT(db->db_state == DB_READ ||
1805 (flags & DB_RF_HAVESTRUCT) == 0);
1806 DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
1807 zio_t *, pio);
1808 cv_wait(&db->db_changed, &db->db_mtx);
1809 } while (db->db_state == DB_READ || db->db_state == DB_FILL);
1810 if (db->db_state == DB_UNCACHED) {
1811 err = SET_ERROR(EIO);
1812 mutex_exit(&db->db_mtx);
1813 DB_DNODE_EXIT(db);
1814 goto done;
1815 }
1816 }
1817
1818 if (db->db_state == DB_CACHED) {
1819 /*
1820 * If the arc buf is compressed or encrypted and the caller
1821 * requested uncompressed data, we need to untransform it
1822 * before returning. We also call arc_untransform() on any
1823 * unauthenticated blocks, which will verify their MAC if
1824 * the key is now available.
1825 */
1826 if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
1827 (arc_is_encrypted(db->db_buf) ||
1828 arc_is_unauthenticated(db->db_buf) ||
1829 arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
1830 spa_t *spa = dn->dn_objset->os_spa;
1831 zbookmark_phys_t zb;
1832
1833 SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
1834 db->db.db_object, db->db_level, db->db_blkid);
1835 dbuf_fix_old_data(db, spa_syncing_txg(spa));
1836 err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
1837 dbuf_set_data(db, db->db_buf);
1838 }
1839 mutex_exit(&db->db_mtx);
1840 } else {
1841 ASSERT(db->db_state == DB_UNCACHED ||
1842 db->db_state == DB_NOFILL);
1843 db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
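/*
 * No parent zio was supplied, so create a root zio we can wait on for
 * the read we are about to issue (see need_wait below).
 */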
1844 if (pio == NULL && (db->db_state == DB_NOFILL ||
1845 (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
1846 spa_t *spa = dn->dn_objset->os_spa;
1847 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
1848 need_wait = B_TRUE;
1849 }
1850 err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
1851 /* dbuf_read_impl drops db_mtx and parent's rwlock. */
1852 miss = (db->db_state != DB_CACHED);
1853 }
1854
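/* Notify the predictive prefetcher of this access (hit or miss). */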
1855 if (err == 0 && prefetch) {
1856 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
1857 flags & DB_RF_HAVESTRUCT);
1858 }
1859 DB_DNODE_EXIT(db);
1860
1861 /*
1862 * If we created a zio we must execute it to avoid leaking it, even if
1863 * it isn't attached to any work due to an error in dbuf_read_impl().
1864 */
1865 if (need_wait) {
1866 if (err == 0)
1867 err = zio_wait(pio);
1868 else
1869 (void) zio_wait(pio);
1870 pio = NULL;
1871 }
1872
1873 done:
1874 if (miss)
1875 DBUF_STAT_BUMP(hash_misses);
1876 else
1877 DBUF_STAT_BUMP(hash_hits);
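/*
 * Propagate the error to the caller-supplied parent zio by attaching
 * a no-op child zio with io_error set.
 */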
1878 if (pio && err != 0) {
1879 zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
1880 ZIO_FLAG_CANFAIL);
1881 zio->io_error = err;
1882 zio_nowait(zio);
1883 }
1884
1885 return (err);
1886 }
1887
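/*
 * Prepare a dbuf to be filled by the caller without reading its current
 * contents from disk: wait out any in-flight read or fill, then attach a
 * fresh ARC buffer (if uncached) and move the dbuf to the FILL state.
 */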
1888 static void
1889 dbuf_noread(dmu_buf_impl_t *db)
1890 {
1891 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1892 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1893 mutex_enter(&db->db_mtx);
1894 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1895 cv_wait(&db->db_changed, &db->db_mtx);
1896 if (db->db_state == DB_UNCACHED) {
1897 ASSERT(db->db_buf == NULL);
1898 ASSERT(db->db.db_data == NULL);
1899 dbuf_set_data(db, dbuf_alloc_arcbuf(db));
1900 db->db_state = DB_FILL;
1901 DTRACE_SET_STATE(db, "assigning filled buffer");
1902 } else if (db->db_state == DB_NOFILL) {
1903 dbuf_clear_data(db);
1904 } else {
1905 ASSERT3U(db->db_state, ==, DB_CACHED);
1906 }
1907 mutex_exit(&db->db_mtx);
1908 }
1909
1910 void
1911 dbuf_unoverride(dbuf_dirty_record_t *dr)
1912 {
1913 dmu_buf_impl_t *db = dr->dr_dbuf;
1914 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
1915 uint64_t txg = dr->dr_txg;
1916
1917 ASSERT(MUTEX_HELD(&db->db_mtx));
1918 /*
1919 * This assert is valid because dmu_sync() expects to be called by
1920 * a zilog's get_data while holding a range lock. This call only
1921 * comes from dbuf_dirty() callers who must also hold a range lock.
1922 */
1923 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
1924 ASSERT(db->db_level == 0);
1925
1926 if (db->db_blkid == DMU_BONUS_BLKID ||
1927 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
1928 return;
1929
1930 ASSERT(db->db_data_pending != dr);
1931
1932 /* free this block */
1933 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
1934 zio_free(db->db_objset->os_spa, txg, bp);
1935
1936 if (dr->dt.dl.dr_brtwrite) {
1937 ASSERT0P(dr->dt.dl.dr_data);
1938 dr->dt.dl.dr_data = db->db_buf;
1939 }
1940 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1941 dr->dt.dl.dr_nopwrite = B_FALSE;
1942 dr->dt.dl.dr_brtwrite = B_FALSE;
1943 dr->dt.dl.dr_has_raw_params = B_FALSE;
1944
1945 /*
1946 * Release the already-written buffer, so we leave it in
1947 * a consistent dirty state. Note that all callers are
1948 * modifying the buffer, so they will immediately do
1949 * another (redundant) arc_release(). Therefore, leave
1950 * the buf thawed to save the effort of freezing &
1951 * immediately re-thawing it.
1952 */
1953 if (dr->dt.dl.dr_data)
1954 arc_release(dr->dt.dl.dr_data, db);
1955 }
1956
1957 /*
1958 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
1959 * data blocks in the free range, so that any future readers will find
1960 * empty blocks.
1961 */
1962 void
1963 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
1964 dmu_tx_t *tx)
1965 {
1966 dmu_buf_impl_t *db_search;
1967 dmu_buf_impl_t *db, *db_next;
1968 uint64_t txg = tx->tx_txg;
1969 avl_index_t where;
1970 dbuf_dirty_record_t *dr;
1971
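/* Clamp the search range to dn_maxblkid, unless we are freeing the spill block. */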
1972 if (end_blkid > dn->dn_maxblkid &&
1973 !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
1974 end_blkid = dn->dn_maxblkid;
1975 dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,
1976 (u_longlong_t)end_blkid);
1977
1978 db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
1979 db_search->db_level = 0;
1980 db_search->db_blkid = start_blkid;
1981 db_search->db_state = DB_SEARCH;
1982
1983 mutex_enter(&dn->dn_dbufs_mtx);
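/*
 * The stub search dbuf (DB_SEARCH) never matches a real dbuf, so
 * avl_find() returns NULL and we iterate from the nearest dbuf at or
 * after start_blkid.
 */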
1984 db = avl_find(&dn->dn_dbufs, db_search, &where);
1985 ASSERT3P(db, ==, NULL);
1986
1987 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
1988
1989 for (; db != NULL; db = db_next) {
1990 db_next = AVL_NEXT(&dn->dn_dbufs, db);
1991 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1992
1993 if (db->db_level != 0 || db->db_blkid > end_blkid) {
1994 break;
1995 }
1996 ASSERT3U(db->db_blkid, >=, start_blkid);
1997
1998 /* found a level 0 buffer in the range */
1999 mutex_enter(&db->db_mtx);
2000 if (dbuf_undirty(db, tx)) {
2001 /* mutex has been dropped and dbuf destroyed */
2002 continue;
2003 }
2004
2005 if (db->db_state == DB_UNCACHED ||
2006 db->db_state == DB_NOFILL ||
2007 db->db_state == DB_EVICTING) {
2008 ASSERT(db->db.db_data == NULL);
2009 mutex_exit(&db->db_mtx);
2010 continue;
2011 }
2012 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
2013 /* will be handled in dbuf_read_done or dbuf_rele */
2014 db->db_freed_in_flight = TRUE;
2015 mutex_exit(&db->db_mtx);
2016 continue;
2017 }
2018 if (zfs_refcount_count(&db->db_holds) == 0) {
2019 ASSERT(db->db_buf);
2020 dbuf_destroy(db);
2021 continue;
2022 }
2023 /* The dbuf is referenced */
2024
2025 dr = list_head(&db->db_dirty_records);
2026 if (dr != NULL) {
2027 if (dr->dr_txg == txg) {
2028 /*
2029 * This buffer is "in-use"; re-adjust the file
2030 * size to reflect that this buffer may
2031 * contain new data when we sync.
2032 */
2033 if (db->db_blkid != DMU_SPILL_BLKID &&
2034 db->db_blkid > dn->dn_maxblkid)
2035 dn->dn_maxblkid = db->db_blkid;
2036 dbuf_unoverride(dr);
2037 } else {
2038 /*
2039 * This dbuf is not dirty in the open context.
2040 * Either uncache it (if it's not referenced in
2041 * the open context) or reset its contents to
2042 * empty.
2043 */
2044 dbuf_fix_old_data(db, txg);
2045 }
2046 }
2047 /* clear the contents if it's cached */
2048 if (db->db_state == DB_CACHED) {
2049 ASSERT(db->db.db_data != NULL);
2050 arc_release(db->db_buf, db);
2051 rw_enter(&db->db_rwlock, RW_WRITER);
2052 memset(db->db.db_data, 0, db->db.db_size);
2053 rw_exit(&db->db_rwlock);
2054 arc_buf_freeze(db->db_buf);
2055 }
2056
2057 mutex_exit(&db->db_mtx);
2058 }
2059
2060 mutex_exit(&dn->dn_dbufs_mtx);
2061 kmem_free(db_search, sizeof (dmu_buf_impl_t));
2062 }
2063
2064 void
2065 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
2066 {
2067 arc_buf_t *buf, *old_buf;
2068 dbuf_dirty_record_t *dr;
2069 int osize = db->db.db_size;
2070 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2071 dnode_t *dn;
2072
2073 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2074
2075 DB_DNODE_ENTER(db);
2076 dn = DB_DNODE(db);
2077
2078 /*
2079 * XXX we should be doing a dbuf_read, checking the return
2080 * value and returning that up to our callers
2081 */
2082 dmu_buf_will_dirty(&db->db, tx);
2083
2084 /* create the data buffer for the new block */
2085 buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
2086
2087 /* copy old block data to the new block */
2088 old_buf = db->db_buf;
2089 memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));
2090 /* zero the remainder */
2091 if (size > osize)
2092 memset((uint8_t *)buf->b_data + osize, 0, size - osize);
2093
2094 mutex_enter(&db->db_mtx);
2095 dbuf_set_data(db, buf);
2096 arc_buf_destroy(old_buf, db);
2097 db->db.db_size = size;
2098
2099 dr = list_head(&db->db_dirty_records);
2100 /* dirty record added by dmu_buf_will_dirty() */
2101 VERIFY(dr != NULL);
2102 if (db->db_level == 0)
2103 dr->dt.dl.dr_data = buf;
2104 ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2105 ASSERT3U(dr->dr_accounted, ==, osize);
2106 dr->dr_accounted = size;
2107 mutex_exit(&db->db_mtx);
2108
2109 dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
2110 DB_DNODE_EXIT(db);
2111 }
2112
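/*
 * Release the ARC buffer backing this dbuf so it can be modified in
 * syncing context (see the dsl_pool_sync_context() assertion below).
 */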
2113 void
2114 dbuf_release_bp(dmu_buf_impl_t *db)
2115 {
2116 objset_t *os __maybe_unused = db->db_objset;
2117
2118 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
2119 ASSERT(arc_released(os->os_phys_buf) ||
2120 list_link_active(&os->os_dsl_dataset->ds_synced_link));
2121 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
2122
2123 (void) arc_release(db->db_buf, db);
2124 }
2125
2126 /*
2127 * We already have a dirty record for this TXG, and we are being
2128 * dirtied again.
2129 */
2130 static void
2131 dbuf_redirty(dbuf_dirty_record_t *dr)
2132 {
2133 dmu_buf_impl_t *db = dr->dr_dbuf;
2134
2135 ASSERT(MUTEX_HELD(&db->db_mtx));
2136
2137 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
2138 /*
2139 * If this buffer has already been written out,
2140 * we now need to reset its state.
2141 */
2142 dbuf_unoverride(dr);
2143 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
2144 db->db_state != DB_NOFILL) {
2145 /* Already released on initial dirty, so just thaw. */
2146 ASSERT(arc_released(db->db_buf));
2147 arc_buf_thaw(db->db_buf);
2148 }
2149 }
2150 }
2151
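/*
 * Dirty a level-0 block without instantiating a dbuf for it. The write is
 * tracked by a "lightweight" dirty record (dr->dt.dll) attached either to
 * the dnode or to the parent indirect block's dirty record. Returns NULL
 * if the parent indirect block could not be read.
 */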
2152 dbuf_dirty_record_t *
2153 dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
2154 {
2155 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2156 IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
2157 dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
2158 ASSERT(dn->dn_maxblkid >= blkid);
2159
2160 dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
2161 list_link_init(&dr->dr_dirty_node);
2162 list_link_init(&dr->dr_dbuf_node);
2163 dr->dr_dnode = dn;
2164 dr->dr_txg = tx->tx_txg;
2165 dr->dt.dll.dr_blkid = blkid;
2166 dr->dr_accounted = dn->dn_datablksz;
2167
2168 /*
2169 * There should not be any dbuf for the block that we're dirtying.
2170 * Otherwise the buffer contents could be inconsistent between the
2171 * dbuf and the lightweight dirty record.
2172 */
2173 ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
2174 NULL));
2175
2176 mutex_enter(&dn->dn_mtx);
2177 int txgoff = tx->tx_txg & TXG_MASK;
2178 if (dn->dn_free_ranges[txgoff] != NULL) {
2179 range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
2180 }
2181
2182 if (dn->dn_nlevels == 1) {
2183 ASSERT3U(blkid, <, dn->dn_nblkptr);
2184 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2185 mutex_exit(&dn->dn_mtx);
2186 rw_exit(&dn->dn_struct_rwlock);
2187 dnode_setdirty(dn, tx);
2188 } else {
2189 mutex_exit(&dn->dn_mtx);
2190
2191 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2192 dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
2193 1, blkid >> epbs, FTAG);
2194 rw_exit(&dn->dn_struct_rwlock);
2195 if (parent_db == NULL) {
2196 kmem_free(dr, sizeof (*dr));
2197 return (NULL);
2198 }
2199 int err = dbuf_read(parent_db, NULL,
2200 (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
2201 if (err != 0) {
2202 dbuf_rele(parent_db, FTAG);
2203 kmem_free(dr, sizeof (*dr));
2204 return (NULL);
2205 }
2206
2207 dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
2208 dbuf_rele(parent_db, FTAG);
2209 mutex_enter(&parent_dr->dt.di.dr_mtx);
2210 ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
2211 list_insert_tail(&parent_dr->dt.di.dr_children, dr);
2212 mutex_exit(&parent_dr->dt.di.dr_mtx);
2213 dr->dr_parent = parent_dr;
2214 }
2215
2216 dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
2217
2218 return (dr);
2219 }
2220
2221 dbuf_dirty_record_t *
2222 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
2223 {
2224 dnode_t *dn;
2225 objset_t *os;
2226 dbuf_dirty_record_t *dr, *dr_next, *dr_head;
2227 int txgoff = tx->tx_txg & TXG_MASK;
2228 boolean_t drop_struct_rwlock = B_FALSE;
2229
2230 ASSERT(tx->tx_txg != 0);
2231 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2232 DMU_TX_DIRTY_BUF(tx, db);
2233
2234 DB_DNODE_ENTER(db);
2235 dn = DB_DNODE(db);
2236 /*
2237 * Shouldn't dirty a regular buffer in syncing context. Private
2238 * objects may be dirtied in syncing context, but only if they
2239 * were already pre-dirtied in open context.
2240 */
2241 #ifdef ZFS_DEBUG
2242 if (dn->dn_objset->os_dsl_dataset != NULL) {
2243 rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
2244 RW_READER, FTAG);
2245 }
2246 ASSERT(!dmu_tx_is_syncing(tx) ||
2247 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
2248 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
2249 dn->dn_objset->os_dsl_dataset == NULL);
2250 if (dn->dn_objset->os_dsl_dataset != NULL)
2251 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
2252 #endif
2253 /*
2254 * We make this assert for private objects as well, but after we
2255 * check if we're already dirty. They are allowed to re-dirty
2256 * in syncing context.
2257 */
2258 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
2259 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
2260 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
2261
2262 mutex_enter(&db->db_mtx);
2263 /*
2264 * XXX make this true for indirects too? The problem is that
2265 * transactions created with dmu_tx_create_assigned() from
2266 * syncing context don't bother holding ahead.
2267 */
2268 ASSERT(db->db_level != 0 ||
2269 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
2270 db->db_state == DB_NOFILL);
2271
2272 mutex_enter(&dn->dn_mtx);
2273 dnode_set_dirtyctx(dn, tx, db);
2274 if (tx->tx_txg > dn->dn_dirty_txg)
2275 dn->dn_dirty_txg = tx->tx_txg;
2276 mutex_exit(&dn->dn_mtx);
2277
2278 if (db->db_blkid == DMU_SPILL_BLKID)
2279 dn->dn_have_spill = B_TRUE;
2280
2281 /*
2282 * If this buffer is already dirty, we're done.
2283 */
2284 dr_head = list_head(&db->db_dirty_records);
2285 ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
2286 db->db.db_object == DMU_META_DNODE_OBJECT);
2287 dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
2288 if (dr_next && dr_next->dr_txg == tx->tx_txg) {
2289 DB_DNODE_EXIT(db);
2290
2291 dbuf_redirty(dr_next);
2292 mutex_exit(&db->db_mtx);
2293 return (dr_next);
2294 }
2295
2296 /*
2297 * Only valid if not already dirty.
2298 */
2299 ASSERT(dn->dn_object == 0 ||
2300 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
2301 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
2302
2303 ASSERT3U(dn->dn_nlevels, >, db->db_level);
2304
2305 /*
2306 * We should only be dirtying in syncing context if it's the
2307 * mos or we're initializing the os or it's a special object.
2308 * However, we are allowed to dirty in syncing context provided
2309 * we already dirtied it in open context. Hence we must make
2310 * this assertion only if we're not already dirty.
2311 */
2312 os = dn->dn_objset;
2313 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
2314 #ifdef ZFS_DEBUG
2315 if (dn->dn_objset->os_dsl_dataset != NULL)
2316 rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
2317 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
2318 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
2319 if (dn->dn_objset->os_dsl_dataset != NULL)
2320 rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
2321 #endif
2322 ASSERT(db->db.db_size != 0);
2323
2324 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
2325
2326 if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
2327 dmu_objset_willuse_space(os, db->db.db_size, tx);
2328 }
2329
2330 /*
2331 * If this buffer is dirty in an old transaction group we need
2332 * to make a copy of it so that the changes we make in this
2333 * transaction group won't leak out when we sync the older txg.
2334 */
2335 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
2336 list_link_init(&dr->dr_dirty_node);
2337 list_link_init(&dr->dr_dbuf_node);
2338 dr->dr_dnode = dn;
2339 if (db->db_level == 0) {
2340 void *data_old = db->db_buf;
2341
2342 if (db->db_state != DB_NOFILL) {
2343 if (db->db_blkid == DMU_BONUS_BLKID) {
2344 dbuf_fix_old_data(db, tx->tx_txg);
2345 data_old = db->db.db_data;
2346 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
2347 /*
2348 * Release the data buffer from the cache so
2349 * that we can modify it without impacting
2350 * possible other users of this cached data
2351 * block. Note that indirect blocks and
2352 * private objects are not released until the
2353 * syncing state (since they are only modified
2354 * then).
2355 */
2356 arc_release(db->db_buf, db);
2357 dbuf_fix_old_data(db, tx->tx_txg);
2358 data_old = db->db_buf;
2359 }
2360 ASSERT(data_old != NULL);
2361 }
2362 dr->dt.dl.dr_data = data_old;
2363 } else {
2364 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
2365 list_create(&dr->dt.di.dr_children,
2366 sizeof (dbuf_dirty_record_t),
2367 offsetof(dbuf_dirty_record_t, dr_dirty_node));
2368 }
2369 if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
2370 dr->dr_accounted = db->db.db_size;
2371 }
2372 dr->dr_dbuf = db;
2373 dr->dr_txg = tx->tx_txg;
2374 list_insert_before(&db->db_dirty_records, dr_next, dr);
2375
2376 /*
2377 * We could have been freed_in_flight between the dbuf_noread
2378 * and dbuf_dirty. We win, as though the dbuf_noread() had
2379 * happened after the free.
2380 */
2381 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2382 db->db_blkid != DMU_SPILL_BLKID) {
2383 mutex_enter(&dn->dn_mtx);
2384 if (dn->dn_free_ranges[txgoff] != NULL) {
2385 range_tree_clear(dn->dn_free_ranges[txgoff],
2386 db->db_blkid, 1);
2387 }
2388 mutex_exit(&dn->dn_mtx);
2389 db->db_freed_in_flight = FALSE;
2390 }
2391
2392 /*
2393 * This buffer is now part of this txg
2394 */
2395 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
2396 db->db_dirtycnt += 1;
2397 ASSERT3U(db->db_dirtycnt, <=, 3);
2398
2399 mutex_exit(&db->db_mtx);
2400
2401 if (db->db_blkid == DMU_BONUS_BLKID ||
2402 db->db_blkid == DMU_SPILL_BLKID) {
2403 mutex_enter(&dn->dn_mtx);
2404 ASSERT(!list_link_active(&dr->dr_dirty_node));
2405 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2406 mutex_exit(&dn->dn_mtx);
2407 dnode_setdirty(dn, tx);
2408 DB_DNODE_EXIT(db);
2409 return (dr);
2410 }
2411
2412 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
2413 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2414 drop_struct_rwlock = B_TRUE;
2415 }
2416
2417 /*
2418 * If we are overwriting a dedup BP, then unless it is snapshotted,
2419 * when we get to syncing context we will need to decrement its
2420 * refcount in the DDT. Prefetch the relevant DDT block so that
2421 * syncing context won't have to wait for the i/o.
2422 */
2423 if (db->db_blkptr != NULL) {
2424 db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
2425 ddt_prefetch(os->os_spa, db->db_blkptr);
2426 dmu_buf_unlock_parent(db, dblt, FTAG);
2427 }
2428
2429 /*
2430 * We need to hold the dn_struct_rwlock to make this assertion,
2431 * because it protects dn_phys / dn_next_nlevels from changing.
2432 */
2433 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
2434 dn->dn_phys->dn_nlevels > db->db_level ||
2435 dn->dn_next_nlevels[txgoff] > db->db_level ||
2436 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
2437 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
2438
2439
2440 if (db->db_level == 0) {
2441 ASSERT(!db->db_objset->os_raw_receive ||
2442 dn->dn_maxblkid >= db->db_blkid);
2443 dnode_new_blkid(dn, db->db_blkid, tx,
2444 drop_struct_rwlock, B_FALSE);
2445 ASSERT(dn->dn_maxblkid >= db->db_blkid);
2446 }
2447
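/*
 * If this block has a parent indirect block, dirty the parent as well and
 * attach this dirty record to it; otherwise the record hangs directly off
 * the dnode.
 */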
2448 if (db->db_level+1 < dn->dn_nlevels) {
2449 dmu_buf_impl_t *parent = db->db_parent;
2450 dbuf_dirty_record_t *di;
2451 int parent_held = FALSE;
2452
2453 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
2454 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2455 parent = dbuf_hold_level(dn, db->db_level + 1,
2456 db->db_blkid >> epbs, FTAG);
2457 ASSERT(parent != NULL);
2458 parent_held = TRUE;
2459 }
2460 if (drop_struct_rwlock)
2461 rw_exit(&dn->dn_struct_rwlock);
2462 ASSERT3U(db->db_level + 1, ==, parent->db_level);
2463 di = dbuf_dirty(parent, tx);
2464 if (parent_held)
2465 dbuf_rele(parent, FTAG);
2466
2467 mutex_enter(&db->db_mtx);
2468 /*
2469 * Since we've dropped the mutex, it's possible that
2470 * dbuf_undirty() might have changed this out from under us.
2471 */
2472 if (list_head(&db->db_dirty_records) == dr ||
2473 dn->dn_object == DMU_META_DNODE_OBJECT) {
2474 mutex_enter(&di->dt.di.dr_mtx);
2475 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
2476 ASSERT(!list_link_active(&dr->dr_dirty_node));
2477 list_insert_tail(&di->dt.di.dr_children, dr);
2478 mutex_exit(&di->dt.di.dr_mtx);
2479 dr->dr_parent = di;
2480 }
2481 mutex_exit(&db->db_mtx);
2482 } else {
2483 ASSERT(db->db_level + 1 == dn->dn_nlevels);
2484 ASSERT(db->db_blkid < dn->dn_nblkptr);
2485 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
2486 mutex_enter(&dn->dn_mtx);
2487 ASSERT(!list_link_active(&dr->dr_dirty_node));
2488 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2489 mutex_exit(&dn->dn_mtx);
2490 if (drop_struct_rwlock)
2491 rw_exit(&dn->dn_struct_rwlock);
2492 }
2493
2494 dnode_setdirty(dn, tx);
2495 DB_DNODE_EXIT(db);
2496 return (dr);
2497 }
2498
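/*
 * Clean up the dirty record for a bonus buffer, freeing any private copy
 * of the bonus data that was made when it was dirtied.
 */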
2499 static void
2500 dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
2501 {
2502 dmu_buf_impl_t *db = dr->dr_dbuf;
2503
2504 if (dr->dt.dl.dr_data != db->db.db_data) {
2505 struct dnode *dn = dr->dr_dnode;
2506 int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
2507
2508 kmem_free(dr->dt.dl.dr_data, max_bonuslen);
2509 arc_space_return(max_bonuslen, ARC_SPACE_BONUS);
2510 }
2511 db->db_data_pending = NULL;
2512 ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
2513 list_remove(&db->db_dirty_records, dr);
2514 if (dr->dr_dbuf->db_level != 0) {
2515 mutex_destroy(&dr->dt.di.dr_mtx);
2516 list_destroy(&dr->dt.di.dr_children);
2517 }
2518 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2519 ASSERT3U(db->db_dirtycnt, >, 0);
2520 db->db_dirtycnt -= 1;
2521 }
2522
2523 /*
2524 * Undirty a buffer in the transaction group referenced by the given
2525 * transaction. Return whether this evicted the dbuf.
2526 */
2527 boolean_t
2528 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
2529 {
2530 uint64_t txg = tx->tx_txg;
2531 boolean_t brtwrite;
2532
2533 ASSERT(txg != 0);
2534
2535 /*
2536 * Due to our use of dn_nlevels below, this can only be called
2537 * in open context, unless we are operating on the MOS.
2538 * From syncing context, dn_nlevels may be different from the
2539 * dn_nlevels used when dbuf was dirtied.
2540 */
2541 ASSERT(db->db_objset ==
2542 dmu_objset_pool(db->db_objset)->dp_meta_objset ||
2543 txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
2544 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2545 ASSERT0(db->db_level);
2546 ASSERT(MUTEX_HELD(&db->db_mtx));
2547
2548 /*
2549 * If this buffer is not dirty, we're done.
2550 */
2551 dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
2552 if (dr == NULL)
2553 return (B_FALSE);
2554 ASSERT(dr->dr_dbuf == db);
2555
2556 brtwrite = dr->dt.dl.dr_brtwrite;
2557 if (brtwrite) {
2558 /*
2559 * We are freeing a block that we cloned in the same
2560 * transaction group.
2561 */
2562 brt_pending_remove(dmu_objset_spa(db->db_objset),
2563 &dr->dt.dl.dr_overridden_by, tx);
2564 }
2565
2566 dnode_t *dn = dr->dr_dnode;
2567
2568 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
2569
2570 ASSERT(db->db.db_size != 0);
2571
2572 dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
2573 dr->dr_accounted, txg);
2574
2575 list_remove(&db->db_dirty_records, dr);
2576
2577 /*
2578 * Note that there are three places in dbuf_dirty()
2579 * where this dirty record may be put on a list.
2580 * Make sure to do a list_remove corresponding to
2581 * every one of those list_insert calls.
2582 */
2583 if (dr->dr_parent) {
2584 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
2585 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
2586 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
2587 } else if (db->db_blkid == DMU_SPILL_BLKID ||
2588 db->db_level + 1 == dn->dn_nlevels) {
2589 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
2590 mutex_enter(&dn->dn_mtx);
2591 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
2592 mutex_exit(&dn->dn_mtx);
2593 }
2594
2595 if (db->db_state != DB_NOFILL && !brtwrite) {
2596 dbuf_unoverride(dr);
2597
2598 ASSERT(db->db_buf != NULL);
2599 ASSERT(dr->dt.dl.dr_data != NULL);
2600 if (dr->dt.dl.dr_data != db->db_buf)
2601 arc_buf_destroy(dr->dt.dl.dr_data, db);
2602 }
2603
2604 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2605
2606 ASSERT(db->db_dirtycnt > 0);
2607 db->db_dirtycnt -= 1;
2608
2609 if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
2610 ASSERT(db->db_state == DB_NOFILL || brtwrite ||
2611 arc_released(db->db_buf));
2612 dbuf_destroy(db);
2613 return (B_TRUE);
2614 }
2615
2616 return (B_FALSE);
2617 }
2618
2619 static void
2620 dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
2621 {
2622 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2623 boolean_t undirty = B_FALSE;
2624
2625 ASSERT(tx->tx_txg != 0);
2626 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2627
2628 /*
2629 * Quick check for dirtiness to improve performance for some workloads
2630 * (e.g. file deletion with indirect blocks cached).
2631 */
2632 mutex_enter(&db->db_mtx);
2633 if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
2634 /*
2635 * It's possible that the dbuf is already dirty but not cached,
2636 * because there are some calls to dbuf_dirty() that don't
2637 * go through dmu_buf_will_dirty().
2638 */
2639 dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2640 if (dr != NULL) {
2641 if (db->db_level == 0 &&
2642 dr->dt.dl.dr_brtwrite) {
2643 /*
2644 * Block cloning: If we are dirtying a cloned
2645 * level 0 block, we cannot simply redirty it,
2646 * because this dr has no associated data.
2647 * We will go through a full undirtying below,
2648 * before dirtying it again.
2649 */
2650 undirty = B_TRUE;
2651 } else {
2652 /* This dbuf is already dirty and cached. */
2653 dbuf_redirty(dr);
2654 mutex_exit(&db->db_mtx);
2655 return;
2656 }
2657 }
2658 }
2659 mutex_exit(&db->db_mtx);
2660
2661 DB_DNODE_ENTER(db);
2662 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
2663 flags |= DB_RF_HAVESTRUCT;
2664 DB_DNODE_EXIT(db);
2665
2666 /*
2667 * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
2668 * want to make sure dbuf_read() will read the pending cloned block and
2669 * not the underlying block that is being replaced. dbuf_undirty() will
2670 * do dbuf_unoverride(), so we will end up with the cloned block content,
2671 * without the overridden BP.
2672 */
2673 (void) dbuf_read(db, NULL, flags);
2674 if (undirty) {
2675 mutex_enter(&db->db_mtx);
2676 VERIFY(!dbuf_undirty(db, tx));
2677 mutex_exit(&db->db_mtx);
2678 }
2679 (void) dbuf_dirty(db, tx);
2680 }
2681
2682 void
2683 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2684 {
2685 dmu_buf_will_dirty_impl(db_fake,
2686 DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
2687 }
2688
2689 boolean_t
2690 dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2691 {
2692 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2693 dbuf_dirty_record_t *dr;
2694
2695 mutex_enter(&db->db_mtx);
2696 dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2697 mutex_exit(&db->db_mtx);
2698 return (dr != NULL);
2699 }
2700
2701 void
2702 dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
2703 {
2704 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2705
2706 /*
2707 * Block cloning: We are going to clone into this block, so undirty
2708 * modifications done to this block so far in this txg. This includes
2709 * writes and clones into this block.
2710 */
2711 mutex_enter(&db->db_mtx);
2712 DBUF_VERIFY(db);
2713 VERIFY(!dbuf_undirty(db, tx));
2714 ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
2715 if (db->db_buf != NULL) {
2716 arc_buf_destroy(db->db_buf, db);
2717 db->db_buf = NULL;
2718 dbuf_clear_data(db);
2719 }
2720
2721 db->db_state = DB_NOFILL;
2722 DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
2723
2724 DBUF_VERIFY(db);
2725 mutex_exit(&db->db_mtx);
2726
2727 dbuf_noread(db);
2728 (void) dbuf_dirty(db, tx);
2729 }
2730
2731 void
2732 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2733 {
2734 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2735
2736 mutex_enter(&db->db_mtx);
2737 db->db_state = DB_NOFILL;
2738 DTRACE_SET_STATE(db, "allocating NOFILL buffer");
2739 mutex_exit(&db->db_mtx);
2740
2741 dbuf_noread(db);
2742 (void) dbuf_dirty(db, tx);
2743 }
2744
2745 void
2746 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
2747 {
2748 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2749
2750 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2751 ASSERT(tx->tx_txg != 0);
2752 ASSERT(db->db_level == 0);
2753 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2754
2755 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
2756 dmu_tx_private_ok(tx));
2757
2758 mutex_enter(&db->db_mtx);
2759 if (db->db_state == DB_NOFILL) {
2760 /*
2761 * Block cloning: We will be completely overwriting a block
2762 * cloned in this transaction group, so let's undirty the
2763 * pending clone and mark the block as uncached. This will be
2764 * as if the clone had never been done. But if the fill can fail,
2765 * we need a way to fall back to the cloned data.
2766 */
2767 if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
2768 mutex_exit(&db->db_mtx);
2769 dmu_buf_will_dirty(db_fake, tx);
2770 return;
2771 }
2772 VERIFY(!dbuf_undirty(db, tx));
2773 db->db_state = DB_UNCACHED;
2774 }
2775 mutex_exit(&db->db_mtx);
2776
2777 dbuf_noread(db);
2778 (void) dbuf_dirty(db, tx);
2779 }
2780
2781 /*
2782 * This function is effectively the same as dmu_buf_will_dirty(), but
2783 * indicates the caller expects raw encrypted data in the db, and provides
2784 * the crypt params (byteorder, salt, iv, mac) which should be stored in the
2785 * blkptr_t when this dbuf is written. This is only used for blocks of
2786 * dnodes, during raw receive.
2787 */
2788 void
2789 dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
2790 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
2791 {
2792 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2793 dbuf_dirty_record_t *dr;
2794
2795 /*
2796 * dr_has_raw_params is only processed for blocks of dnodes
2797 * (see dbuf_sync_dnode_leaf_crypt()).
2798 */
2799 ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
2800 ASSERT3U(db->db_level, ==, 0);
2801 ASSERT(db->db_objset->os_raw_receive);
2802
2803 dmu_buf_will_dirty_impl(db_fake,
2804 DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
2805
2806 dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2807
2808 ASSERT3P(dr, !=, NULL);
2809
2810 dr->dt.dl.dr_has_raw_params = B_TRUE;
2811 dr->dt.dl.dr_byteorder = byteorder;
2812 memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);
2813 memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);
2814 memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);
2815 }
2816
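/*
 * Mark the current dirty record as overridden by the given block pointer,
 * so syncing context will use *bp instead of writing the data out again.
 */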
2817 static void
2818 dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
2819 {
2820 struct dirty_leaf *dl;
2821 dbuf_dirty_record_t *dr;
2822
2823 dr = list_head(&db->db_dirty_records);
2824 ASSERT3P(dr, !=, NULL);
2825 ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2826 dl = &dr->dt.dl;
2827 dl->dr_overridden_by = *bp;
2828 dl->dr_override_state = DR_OVERRIDDEN;
2829 BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
2830 }
2831
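/*
 * Complete a fill started by dmu_buf_will_fill(). Returns B_TRUE only if a
 * failed fill was undone (the dirty record was discarded). A typical,
 * purely illustrative writer sequence looks like:
 *
 *	dmu_buf_will_fill(dbuf, tx, B_FALSE);
 *	memcpy(db->db_data, src, db->db_size);
 *	(void) dmu_buf_fill_done(dbuf, tx, B_FALSE);
 */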
2832 boolean_t
2833 dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
2834 {
2835 (void) tx;
2836 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2837 mutex_enter(&db->db_mtx);
2838 DBUF_VERIFY(db);
2839
2840 if (db->db_state == DB_FILL) {
2841 if (db->db_level == 0 && db->db_freed_in_flight) {
2842 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2843 /* we were freed while filling */
2844 /* XXX dbuf_undirty? */
2845 memset(db->db.db_data, 0, db->db.db_size);
2846 db->db_freed_in_flight = FALSE;
2847 db->db_state = DB_CACHED;
2848 DTRACE_SET_STATE(db,
2849 "fill done handling freed in flight");
2850 failed = B_FALSE;
2851 } else if (failed) {
2852 VERIFY(!dbuf_undirty(db, tx));
2853 db->db_buf = NULL;
2854 dbuf_clear_data(db);
2855 DTRACE_SET_STATE(db, "fill failed");
2856 } else {
2857 db->db_state = DB_CACHED;
2858 DTRACE_SET_STATE(db, "fill done");
2859 }
2860 cv_broadcast(&db->db_changed);
2861 } else {
2862 db->db_state = DB_CACHED;
2863 failed = B_FALSE;
2864 }
2865 mutex_exit(&db->db_mtx);
2866 return (failed);
2867 }
2868
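/*
 * Store small data directly in this dbuf's block pointer as an embedded
 * BP, so no separate data block is allocated on disk.
 */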
2869 void
2870 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
2871 bp_embedded_type_t etype, enum zio_compress comp,
2872 int uncompressed_size, int compressed_size, int byteorder,
2873 dmu_tx_t *tx)
2874 {
2875 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2876 struct dirty_leaf *dl;
2877 dmu_object_type_t type;
2878 dbuf_dirty_record_t *dr;
2879
2880 if (etype == BP_EMBEDDED_TYPE_DATA) {
2881 ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
2882 SPA_FEATURE_EMBEDDED_DATA));
2883 }
2884
2885 DB_DNODE_ENTER(db);
2886 type = DB_DNODE(db)->dn_type;
2887 DB_DNODE_EXIT(db);
2888
2889 ASSERT0(db->db_level);
2890 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2891
2892 dmu_buf_will_not_fill(dbuf, tx);
2893
2894 dr = list_head(&db->db_dirty_records);
2895 ASSERT3P(dr, !=, NULL);
2896 ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2897 dl = &dr->dt.dl;
2898 encode_embedded_bp_compressed(&dl->dr_overridden_by,
2899 data, comp, uncompressed_size, compressed_size);
2900 BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
2901 BP_SET_TYPE(&dl->dr_overridden_by, type);
2902 BP_SET_LEVEL(&dl->dr_overridden_by, 0);
2903 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
2904
2905 dl->dr_override_state = DR_OVERRIDDEN;
2906 BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
2907 }
2908
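/*
 * Replace this dbuf's on-disk contents with a redaction marker: an
 * override BP with BP_REDACTED set, so no data block is written.
 */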
2909 void
2910 dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
2911 {
2912 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2913 dmu_object_type_t type;
2914 ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
2915 SPA_FEATURE_REDACTED_DATASETS));
2916
2917 DB_DNODE_ENTER(db);
2918 type = DB_DNODE(db)->dn_type;
2919 DB_DNODE_EXIT(db);
2920
2921 ASSERT0(db->db_level);
2922 dmu_buf_will_not_fill(dbuf, tx);
2923
2924 blkptr_t bp = { { { {0} } } };
2925 BP_SET_TYPE(&bp, type);
2926 BP_SET_LEVEL(&bp, 0);
2927 BP_SET_BIRTH(&bp, tx->tx_txg, 0);
2928 BP_SET_REDACTED(&bp);
2929 BPE_SET_LSIZE(&bp, dbuf->db_size);
2930
2931 dbuf_override_impl(db, &bp, tx);
2932 }
2933
2934 /*
2935 * Directly assign a provided arc buf to a given dbuf if it's not referenced
2936 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
2937 */
2938 void
2939 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
2940 {
2941 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2942 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2943 ASSERT(db->db_level == 0);
2944 ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
2945 ASSERT(buf != NULL);
2946 ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
2947 ASSERT(tx->tx_txg != 0);
2948
2949 arc_return_buf(buf, db);
2950 ASSERT(arc_released(buf));
2951
2952 mutex_enter(&db->db_mtx);
2953
2954 while (db->db_state == DB_READ || db->db_state == DB_FILL)
2955 cv_wait(&db->db_changed, &db->db_mtx);
2956
2957 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
2958 db->db_state == DB_NOFILL);
2959
2960 if (db->db_state == DB_CACHED &&
2961 zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
2962 /*
2963 * In practice, we will never have a case where we have an
2964 * encrypted arc buffer while additional holds exist on the
2965 * dbuf. We don't handle this here so we simply assert that
2966 * fact instead.
2967 */
2968 ASSERT(!arc_is_encrypted(buf));
2969 mutex_exit(&db->db_mtx);
2970 (void) dbuf_dirty(db, tx);
2971 memcpy(db->db.db_data, buf->b_data, db->db.db_size);
2972 arc_buf_destroy(buf, db);
2973 return;
2974 }
2975
2976 if (db->db_state == DB_CACHED) {
2977 dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
2978
2979 ASSERT(db->db_buf != NULL);
2980 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
2981 ASSERT(dr->dt.dl.dr_data == db->db_buf);
2982
2983 if (!arc_released(db->db_buf)) {
2984 ASSERT(dr->dt.dl.dr_override_state ==
2985 DR_OVERRIDDEN);
2986 arc_release(db->db_buf, db);
2987 }
2988 dr->dt.dl.dr_data = buf;
2989 arc_buf_destroy(db->db_buf, db);
2990 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
2991 arc_release(db->db_buf, db);
2992 arc_buf_destroy(db->db_buf, db);
2993 }
2994 db->db_buf = NULL;
2995 } else if (db->db_state == DB_NOFILL) {
2996 /*
2997 * We will be completely replacing the cloned block. In case
2998 * it was cloned in this transaction group, let's undirty the
2999 * pending clone and mark the block as uncached. This will be
3000 * as if the clone was never done.
3001 */
3002 VERIFY(!dbuf_undirty(db, tx));
3003 db->db_state = DB_UNCACHED;
3004 }
3005 ASSERT(db->db_buf == NULL);
3006 dbuf_set_data(db, buf);
3007 db->db_state = DB_FILL;
3008 DTRACE_SET_STATE(db, "filling assigned arcbuf");
3009 mutex_exit(&db->db_mtx);
3010 (void) dbuf_dirty(db, tx);
3011 dmu_buf_fill_done(&db->db, tx, B_FALSE);
3012 }
3013
3014 void
3015 dbuf_destroy(dmu_buf_impl_t *db)
3016 {
3017 dnode_t *dn;
3018 dmu_buf_impl_t *parent = db->db_parent;
3019 dmu_buf_impl_t *dndb;
3020
3021 ASSERT(MUTEX_HELD(&db->db_mtx));
3022 ASSERT(zfs_refcount_is_zero(&db->db_holds));
3023
3024 if (db->db_buf != NULL) {
3025 arc_buf_destroy(db->db_buf, db);
3026 db->db_buf = NULL;
3027 }
3028
3029 if (db->db_blkid == DMU_BONUS_BLKID) {
3030 int slots = DB_DNODE(db)->dn_num_slots;
3031 int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
3032 if (db->db.db_data != NULL) {
3033 kmem_free(db->db.db_data, bonuslen);
3034 arc_space_return(bonuslen, ARC_SPACE_BONUS);
3035 db->db_state = DB_UNCACHED;
3036 DTRACE_SET_STATE(db, "buffer cleared");
3037 }
3038 }
3039
3040 dbuf_clear_data(db);
3041
3042 if (multilist_link_active(&db->db_cache_link)) {
3043 ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
3044 db->db_caching_status == DB_DBUF_METADATA_CACHE);
3045
3046 multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
3047
3048 ASSERT0(dmu_buf_user_size(&db->db));
3049 (void) zfs_refcount_remove_many(
3050 &dbuf_caches[db->db_caching_status].size,
3051 db->db.db_size, db);
3052
3053 if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
3054 DBUF_STAT_BUMPDOWN(metadata_cache_count);
3055 } else {
3056 DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
3057 DBUF_STAT_BUMPDOWN(cache_count);
3058 DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
3059 db->db.db_size);
3060 }
3061 db->db_caching_status = DB_NO_CACHE;
3062 }
3063
3064 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
3065 ASSERT(db->db_data_pending == NULL);
3066 ASSERT(list_is_empty(&db->db_dirty_records));
3067
3068 db->db_state = DB_EVICTING;
3069 DTRACE_SET_STATE(db, "buffer eviction started");
3070 db->db_blkptr = NULL;
3071
3072 /*
3073 * Now that db_state is DB_EVICTING, nobody else can find this via
3074 * the hash table. We can now drop db_mtx, which allows us to
3075 * acquire the dn_dbufs_mtx.
3076 */
3077 mutex_exit(&db->db_mtx);
3078
3079 DB_DNODE_ENTER(db);
3080 dn = DB_DNODE(db);
3081 dndb = dn->dn_dbuf;
3082 if (db->db_blkid != DMU_BONUS_BLKID) {
3083 boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
3084 if (needlock)
3085 mutex_enter_nested(&dn->dn_dbufs_mtx,
3086 NESTED_SINGLE);
3087 avl_remove(&dn->dn_dbufs, db);
3088 membar_producer();
3089 DB_DNODE_EXIT(db);
3090 if (needlock)
3091 mutex_exit(&dn->dn_dbufs_mtx);
3092 /*
3093 * Decrementing the dbuf count means that the hold corresponding
3094 * to the removed dbuf is no longer discounted in dnode_move(),
3095 * so the dnode cannot be moved until after we release the hold.
3096 * The membar_producer() ensures visibility of the decremented
3097 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
3098 * release any lock.
3099 */
3100 mutex_enter(&dn->dn_mtx);
3101 dnode_rele_and_unlock(dn, db, B_TRUE);
3102 db->db_dnode_handle = NULL;
3103
3104 dbuf_hash_remove(db);
3105 } else {
3106 DB_DNODE_EXIT(db);
3107 }
3108
3109 ASSERT(zfs_refcount_is_zero(&db->db_holds));
3110
3111 db->db_parent = NULL;
3112
3113 ASSERT(db->db_buf == NULL);
3114 ASSERT(db->db.db_data == NULL);
3115 ASSERT(db->db_hash_next == NULL);
3116 ASSERT(db->db_blkptr == NULL);
3117 ASSERT(db->db_data_pending == NULL);
3118 ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
3119 ASSERT(!multilist_link_active(&db->db_cache_link));
3120
3121 /*
3122 * If this dbuf is referenced from an indirect dbuf,
3123 * decrement the ref count on the indirect dbuf.
3124 */
3125 if (parent && parent != dndb) {
3126 mutex_enter(&parent->db_mtx);
3127 dbuf_rele_and_unlock(parent, db, B_TRUE);
3128 }
3129
3130 kmem_cache_free(dbuf_kmem_cache, db);
3131 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
3132 }
3133
3134 /*
3135 * Note: While bpp will always be updated if the function returns success,
3136 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
3137 * this happens when the dnode is the meta-dnode, or {user|group|project}used
3138 * object.
3139 */
3140 __attribute__((always_inline))
3141 static inline int
3142 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
3143 dmu_buf_impl_t **parentp, blkptr_t **bpp)
3144 {
3145 *parentp = NULL;
3146 *bpp = NULL;
3147
3148 ASSERT(blkid != DMU_BONUS_BLKID);
3149
3150 if (blkid == DMU_SPILL_BLKID) {
3151 mutex_enter(&dn->dn_mtx);
3152 if (dn->dn_have_spill &&
3153 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
3154 *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
3155 else
3156 *bpp = NULL;
3157 dbuf_add_ref(dn->dn_dbuf, NULL);
3158 *parentp = dn->dn_dbuf;
3159 mutex_exit(&dn->dn_mtx);
3160 return (0);
3161 }
3162
3163 int nlevels =
3164 (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
3165 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
3166
3167 ASSERT3U(level * epbs, <, 64);
3168 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3169 /*
3170 * This assertion shouldn't trip as long as the max indirect block size
3171 * is less than 1M. The reason for this is that up to that point,
3172 * the number of levels required to address an entire object with blocks
3173 * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
3174 * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
3175 * (i.e. we can address the entire object), objects will all use at most
3176 * N-1 levels and the assertion won't overflow. However, once epbs is
3177 * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
3178 * enough to address an entire object, so objects will have 5 levels,
3179 * but then this assertion will overflow.
3180 *
3181 * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
3182 * need to redo this logic to handle overflows.
3183 */
3184 ASSERT(level >= nlevels ||
3185 ((nlevels - level - 1) * epbs) +
3186 highbit64(dn->dn_phys->dn_nblkptr) <= 64);
3187 if (level >= nlevels ||
3188 blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
3189 ((nlevels - level - 1) * epbs)) ||
3190 (fail_sparse &&
3191 blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
3192 /* the buffer has no parent yet */
3193 return (SET_ERROR(ENOENT));
3194 } else if (level < nlevels-1) {
3195 /* this block is referenced from an indirect block */
3196 int err;
3197
3198 err = dbuf_hold_impl(dn, level + 1,
3199 blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
3200
3201 if (err)
3202 return (err);
3203 err = dbuf_read(*parentp, NULL,
3204 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
3205 if (err) {
3206 dbuf_rele(*parentp, NULL);
3207 *parentp = NULL;
3208 return (err);
3209 }
3210 rw_enter(&(*parentp)->db_rwlock, RW_READER);
3211 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
3212 (blkid & ((1ULL << epbs) - 1));
3213 if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
3214 ASSERT(BP_IS_HOLE(*bpp));
3215 rw_exit(&(*parentp)->db_rwlock);
3216 return (0);
3217 } else {
3218 /* the block is referenced from the dnode */
3219 ASSERT3U(level, ==, nlevels-1);
3220 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
3221 blkid < dn->dn_phys->dn_nblkptr);
3222 if (dn->dn_dbuf) {
3223 dbuf_add_ref(dn->dn_dbuf, NULL);
3224 *parentp = dn->dn_dbuf;
3225 }
3226 *bpp = &dn->dn_phys->dn_blkptr[blkid];
3227 return (0);
3228 }
3229 }
3230
3231 static dmu_buf_impl_t *
3232 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
3233 dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
3234 {
3235 objset_t *os = dn->dn_objset;
3236 dmu_buf_impl_t *db, *odb;
3237
3238 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3239 ASSERT(dn->dn_type != DMU_OT_NONE);
3240
3241 db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
3242
3243 list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
3244 offsetof(dbuf_dirty_record_t, dr_dbuf_node));
3245
3246 db->db_objset = os;
3247 db->db.db_object = dn->dn_object;
3248 db->db_level = level;
3249 db->db_blkid = blkid;
3250 db->db_dirtycnt = 0;
3251 db->db_dnode_handle = dn->dn_handle;
3252 db->db_parent = parent;
3253 db->db_blkptr = blkptr;
3254 db->db_hash = hash;
3255
3256 db->db_user = NULL;
3257 db->db_user_immediate_evict = FALSE;
3258 db->db_freed_in_flight = FALSE;
3259 db->db_pending_evict = FALSE;
3260
3261 if (blkid == DMU_BONUS_BLKID) {
3262 ASSERT3P(parent, ==, dn->dn_dbuf);
3263 db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
3264 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
3265 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
3266 db->db.db_offset = DMU_BONUS_BLKID;
3267 db->db_state = DB_UNCACHED;
3268 DTRACE_SET_STATE(db, "bonus buffer created");
3269 db->db_caching_status = DB_NO_CACHE;
3270 /* the bonus dbuf is not placed in the hash table */
3271 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
3272 return (db);
3273 } else if (blkid == DMU_SPILL_BLKID) {
3274 db->db.db_size = (blkptr != NULL) ?
3275 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
3276 db->db.db_offset = 0;
3277 } else {
3278 int blocksize =
3279 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
3280 db->db.db_size = blocksize;
3281 db->db.db_offset = db->db_blkid * blocksize;
3282 }
3283
3284 /*
3285 * Hold the dn_dbufs_mtx while we get the new dbuf
3286 * in the hash table *and* added to the dbufs list.
3287 * This prevents a possible deadlock with someone
3288 * trying to look up this dbuf before it's added to the
3289 * dn_dbufs list.
3290 */
3291 mutex_enter(&dn->dn_dbufs_mtx);
3292 db->db_state = DB_EVICTING; /* not worth logging this state change */
3293 if ((odb = dbuf_hash_insert(db)) != NULL) {
3294 /* someone else inserted it first */
3295 mutex_exit(&dn->dn_dbufs_mtx);
3296 kmem_cache_free(dbuf_kmem_cache, db);
3297 DBUF_STAT_BUMP(hash_insert_race);
3298 return (odb);
3299 }
3300 avl_add(&dn->dn_dbufs, db);
3301
3302 db->db_state = DB_UNCACHED;
3303 DTRACE_SET_STATE(db, "regular buffer created");
3304 db->db_caching_status = DB_NO_CACHE;
3305 mutex_exit(&dn->dn_dbufs_mtx);
3306 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
3307
3308 if (parent && parent != dn->dn_dbuf)
3309 dbuf_add_ref(parent, db);
3310
3311 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
3312 zfs_refcount_count(&dn->dn_holds) > 0);
3313 (void) zfs_refcount_add(&dn->dn_holds, db);
3314
3315 dprintf_dbuf(db, "db=%p\n", db);
3316
3317 return (db);
3318 }
3319
3320 /*
3321 * This function returns a block pointer and information about the object,
3322 * given a dnode and a block. This is a publicly accessible version of
3323 * dbuf_findbp that only returns some information, rather than the
3324 * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock
3325 * should be locked as (at least) a reader.
3326 */
3327 int
3328 dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
3329 blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
3330 {
3331 dmu_buf_impl_t *dbp = NULL;
3332 blkptr_t *bp2;
3333 int err = 0;
3334 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3335
3336 err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
3337 if (err == 0) {
3338 ASSERT3P(bp2, !=, NULL);
3339 *bp = *bp2;
3340 if (dbp != NULL)
3341 dbuf_rele(dbp, NULL);
3342 if (datablkszsec != NULL)
3343 *datablkszsec = dn->dn_phys->dn_datablkszsec;
3344 if (indblkshift != NULL)
3345 *indblkshift = dn->dn_phys->dn_indblkshift;
3346 }
3347
3348 return (err);
3349 }
3350
3351 typedef struct dbuf_prefetch_arg {
3352 spa_t *dpa_spa; /* The spa to issue the prefetch in. */
3353 zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
3354 int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
3355 int dpa_curlevel; /* The current level that we're reading */
3356 dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
3357 zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
3358 zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
3359 arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
3360 dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
3361 void *dpa_arg; /* prefetch completion arg */
3362 } dbuf_prefetch_arg_t;
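
/*
 * A dbuf_prefetch_arg_t is allocated in dbuf_prefetch_impl() and travels
 * down the chain of indirect-block reads as the arc_read() private data;
 * it is freed by dbuf_prefetch_fini() once the final prefetch has been
 * issued or the prefetch is abandoned.
 */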
3363
3364 static void
3365 dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
3366 {
3367 if (dpa->dpa_cb != NULL) {
3368 dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,
3369 dpa->dpa_zb.zb_blkid, io_done);
3370 }
3371 kmem_free(dpa, sizeof (*dpa));
3372 }
3373
3374 static void
3375 dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
3376 const blkptr_t *iobp, arc_buf_t *abuf, void *private)
3377 {
3378 (void) zio, (void) zb, (void) iobp;
3379 dbuf_prefetch_arg_t *dpa = private;
3380
3381 if (abuf != NULL)
3382 arc_buf_destroy(abuf, private);
3383
3384 dbuf_prefetch_fini(dpa, B_TRUE);
3385 }
3386
3387 /*
3388 * Actually issue the prefetch read for the block given.
3389 */
3390 static void
3391 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
3392 {
3393 ASSERT(!BP_IS_REDACTED(bp) ||
3394 dsl_dataset_feature_is_active(
3395 dpa->dpa_dnode->dn_objset->os_dsl_dataset,
3396 SPA_FEATURE_REDACTED_DATASETS));
3397
3398 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
3399 return (dbuf_prefetch_fini(dpa, B_FALSE));
3400
3401 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
3402 arc_flags_t aflags =
3403 dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
3404 ARC_FLAG_NO_BUF;
3405
3406 /* dnodes are always read as raw and then converted later */
3407 if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
3408 dpa->dpa_curlevel == 0)
3409 zio_flags |= ZIO_FLAG_RAW;
3410
3411 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
3412 ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
3413 ASSERT(dpa->dpa_zio != NULL);
3414 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
3415 dbuf_issue_final_prefetch_done, dpa,
3416 dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
3417 }
3418
3419 /*
3420 * Called when an indirect block above our prefetch target is read in. This
3421 * will either read in the next indirect block down the tree or issue the actual
3422 * prefetch if the next block down is our target.
3423 */
3424 static void
3425 dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
3426 const blkptr_t *iobp, arc_buf_t *abuf, void *private)
3427 {
3428 (void) zb, (void) iobp;
3429 dbuf_prefetch_arg_t *dpa = private;
3430
3431 ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
3432 ASSERT3S(dpa->dpa_curlevel, >, 0);
3433
3434 if (abuf == NULL) {
3435 ASSERT(zio == NULL || zio->io_error != 0);
3436 dbuf_prefetch_fini(dpa, B_TRUE);
3437 return;
3438 }
3439 ASSERT(zio == NULL || zio->io_error == 0);
3440
3441 /*
3442 * The dpa_dnode is only valid if we are called with a NULL
3443 * zio. This indicates that the arc_read() returned without
3444 * first calling zio_read() to issue a physical read. Once
3445 * a physical read is made the dpa_dnode must be invalidated
3446 * as the locks guarding it may have been dropped. If the
3447 * dpa_dnode is still valid, then we want to add it to the dbuf
3448 * cache. To do so, we must hold the dbuf associated with the block
3449 * we just prefetched, read its contents so that we associate it
3450 * with an arc_buf_t, and then release it.
3451 */
3452 if (zio != NULL) {
3453 ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
3454 if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
3455 ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
3456 } else {
3457 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
3458 }
3459 ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
3460
3461 dpa->dpa_dnode = NULL;
3462 } else if (dpa->dpa_dnode != NULL) {
3463 uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
3464 (dpa->dpa_epbs * (dpa->dpa_curlevel -
3465 dpa->dpa_zb.zb_level));
3466 dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
3467 dpa->dpa_curlevel, curblkid, FTAG);
3468 if (db == NULL) {
3469 arc_buf_destroy(abuf, private);
3470 dbuf_prefetch_fini(dpa, B_TRUE);
3471 return;
3472 }
3473 (void) dbuf_read(db, NULL,
3474 DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
3475 dbuf_rele(db, FTAG);
3476 }
3477
3478 dpa->dpa_curlevel--;
3479 uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
3480 (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
3481 blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
3482 P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
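/*
 * For example, with 128K indirect blocks (dpa_epbs == 10) and a level-0
 * target at zb_blkid 3000, stepping down from curlevel 2 to curlevel 1
 * gives nextblkid = 3000 >> 10 == 2, and the bp of interest sits at
 * slot P2PHASE(2, 1024) == 2 of the indirect block just read.
 */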
3483
3484 ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
3485 dsl_dataset_feature_is_active(
3486 dpa->dpa_dnode->dn_objset->os_dsl_dataset,
3487 SPA_FEATURE_REDACTED_DATASETS)));
3488 if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
3489 arc_buf_destroy(abuf, private);
3490 dbuf_prefetch_fini(dpa, B_TRUE);
3491 return;
3492 } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
3493 ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
3494 dbuf_issue_final_prefetch(dpa, bp);
3495 } else {
3496 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
3497 zbookmark_phys_t zb;
3498
3499 /* flag if L2ARC eligible, l2arc_noprefetch then decides */
3500 if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
3501 iter_aflags |= ARC_FLAG_L2CACHE;
3502
3503 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
3504
3505 SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
3506 dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
3507
3508 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
3509 bp, dbuf_prefetch_indirect_done, dpa,
3510 ZIO_PRIORITY_SYNC_READ,
3511 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3512 &iter_aflags, &zb);
3513 }
3514
3515 arc_buf_destroy(abuf, private);
3516 }
3517
3518 /*
3519 * Issue prefetch reads for the given block on the given level. If the indirect
3520 * blocks above that block are not in memory, we will read them in
3521 * asynchronously. As a result, this call never blocks waiting for a read to
3522 * complete. Note that the prefetch might fail if the dataset is encrypted and
3523 * the encryption key is unmapped before the IO completes.
3524 */
3525 int
3526 dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
3527 zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
3528 void *arg)
3529 {
3530 blkptr_t bp;
3531 int epbs, nlevels, curlevel;
3532 uint64_t curblkid;
3533
3534 ASSERT(blkid != DMU_BONUS_BLKID);
3535 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3536
3537 if (blkid > dn->dn_maxblkid)
3538 goto no_issue;
3539
3540 if (level == 0 && dnode_block_freed(dn, blkid))
3541 goto no_issue;
3542
3543 /*
3544 * This dnode hasn't been written to disk yet, so there's nothing to
3545 * prefetch.
3546 */
3547 nlevels = dn->dn_phys->dn_nlevels;
3548 if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
3549 goto no_issue;
3550
3551 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3552 if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
3553 goto no_issue;
3554
3555 dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
3556 level, blkid, NULL);
3557 if (db != NULL) {
3558 mutex_exit(&db->db_mtx);
3559 /*
3560 * This dbuf already exists. It is either CACHED, or
3561 * (we assume) about to be read or filled.
3562 */
3563 goto no_issue;
3564 }
3565
3566 /*
3567 * Find the closest ancestor (indirect block) of the target block
3568 * that is present in the cache. In this indirect block, we will
3569 * find the bp that is at curlevel, curblkid.
3570 */
3571 curlevel = level;
3572 curblkid = blkid;
3573 while (curlevel < nlevels - 1) {
3574 int parent_level = curlevel + 1;
3575 uint64_t parent_blkid = curblkid >> epbs;
3576 dmu_buf_impl_t *db;
3577
3578 if (dbuf_hold_impl(dn, parent_level, parent_blkid,
3579 FALSE, TRUE, FTAG, &db) == 0) {
3580 blkptr_t *bpp = db->db_buf->b_data;
3581 bp = bpp[P2PHASE(curblkid, 1 << epbs)];
3582 dbuf_rele(db, FTAG);
3583 break;
3584 }
3585
3586 curlevel = parent_level;
3587 curblkid = parent_blkid;
3588 }
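/*
 * For example, with epbs == 10 and a level-0 target at blkid 3000, the
 * loop above first looks for the level-1 indirect covering blkids
 * 2048-3071 (parent_blkid == 3000 >> 10 == 2); if that is not cached it
 * moves up to the level-2 block, and so on toward the dnode.
 */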
3589
3590 if (curlevel == nlevels - 1) {
3591 /* No cached indirect blocks found. */
3592 ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
3593 bp = dn->dn_phys->dn_blkptr[curblkid];
3594 }
3595 ASSERT(!BP_IS_REDACTED(&bp) ||
3596 dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
3597 SPA_FEATURE_REDACTED_DATASETS));
3598 if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
3599 goto no_issue;
3600
3601 ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
3602
3603 zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
3604 ZIO_FLAG_CANFAIL);
3605
3606 dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
3607 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
3608 SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
3609 dn->dn_object, level, blkid);
3610 dpa->dpa_curlevel = curlevel;
3611 dpa->dpa_prio = prio;
3612 dpa->dpa_aflags = aflags;
3613 dpa->dpa_spa = dn->dn_objset->os_spa;
3614 dpa->dpa_dnode = dn;
3615 dpa->dpa_epbs = epbs;
3616 dpa->dpa_zio = pio;
3617 dpa->dpa_cb = cb;
3618 dpa->dpa_arg = arg;
3619
3620 if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
3621 dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
3622 else if (dnode_level_is_l2cacheable(&bp, dn, level))
3623 dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
3624
3625 /*
3626 * If we have the indirect just above us, no need to do the asynchronous
3627 * prefetch chain; we'll just run the last step ourselves. If we're at
3628 * a higher level, though, we want to issue the prefetches for all the
3629 * indirect blocks asynchronously, so we can go on with whatever we were
3630 * doing.
3631 */
3632 if (curlevel == level) {
3633 ASSERT3U(curblkid, ==, blkid);
3634 dbuf_issue_final_prefetch(dpa, &bp);
3635 } else {
3636 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
3637 zbookmark_phys_t zb;
3638
3639 /* flag if L2ARC eligible, l2arc_noprefetch then decides */
3640 if (dnode_level_is_l2cacheable(&bp, dn, level))
3641 iter_aflags |= ARC_FLAG_L2CACHE;
3642
3643 SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
3644 dn->dn_object, curlevel, curblkid);
3645 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
3646 &bp, dbuf_prefetch_indirect_done, dpa,
3647 ZIO_PRIORITY_SYNC_READ,
3648 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3649 &iter_aflags, &zb);
3650 }
3651 /*
3652 * We use pio here instead of dpa_zio since it's possible that
3653 * dpa may have already been freed.
3654 */
3655 zio_nowait(pio);
3656 return (1);
3657 no_issue:
3658 if (cb != NULL)
3659 cb(arg, level, blkid, B_FALSE);
3660 return (0);
3661 }
3662
3663 int
3664 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
3665 arc_flags_t aflags)
3666 {
3667
3668 return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
3669 }
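
/*
 * Example (an illustrative sketch): prefetching the first few level-0
 * blocks of a held dnode, with no completion callback:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	for (uint64_t blkid = 0; blkid < 3; blkid++)
 *		(void) dbuf_prefetch(dn, 0, blkid,
 *		    ZIO_PRIORITY_ASYNC_READ, 0);
 *	rw_exit(&dn->dn_struct_rwlock);
 *
 * The return value (1 if an i/o was issued, 0 otherwise) may be ignored
 * by callers that prefetch opportunistically.
 */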
3670
3671 /*
3672 * Helper function for dbuf_hold_impl() to copy a buffer. Handles
3673 * the case of encrypted, compressed and uncompressed buffers by
3674 * allocating the new buffer, respectively, with arc_alloc_raw_buf(),
3675 * arc_alloc_compressed_buf() or arc_alloc_buf().
3676 *
3677 * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
3678 */
3679 noinline static void
3680 dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
3681 {
3682 dbuf_dirty_record_t *dr = db->db_data_pending;
3683 arc_buf_t *data = dr->dt.dl.dr_data;
3684 enum zio_compress compress_type = arc_get_compression(data);
3685 uint8_t complevel = arc_get_complevel(data);
3686
3687 if (arc_is_encrypted(data)) {
3688 boolean_t byteorder;
3689 uint8_t salt[ZIO_DATA_SALT_LEN];
3690 uint8_t iv[ZIO_DATA_IV_LEN];
3691 uint8_t mac[ZIO_DATA_MAC_LEN];
3692
3693 arc_get_raw_params(data, &byteorder, salt, iv, mac);
3694 dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
3695 dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
3696 dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
3697 compress_type, complevel));
3698 } else if (compress_type != ZIO_COMPRESS_OFF) {
3699 dbuf_set_data(db, arc_alloc_compressed_buf(
3700 dn->dn_objset->os_spa, db, arc_buf_size(data),
3701 arc_buf_lsize(data), compress_type, complevel));
3702 } else {
3703 dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
3704 DBUF_GET_BUFC_TYPE(db), db->db.db_size));
3705 }
3706
3707 rw_enter(&db->db_rwlock, RW_WRITER);
3708 memcpy(db->db.db_data, data->b_data, arc_buf_size(data));
3709 rw_exit(&db->db_rwlock);
3710 }
3711
3712 /*
3713 * Returns with db_holds incremented, and db_mtx not held.
3714 * Note: dn_struct_rwlock must be held.
3715 */
3716 int
3717 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
3718 boolean_t fail_sparse, boolean_t fail_uncached,
3719 const void *tag, dmu_buf_impl_t **dbp)
3720 {
3721 dmu_buf_impl_t *db, *parent = NULL;
3722 uint64_t hv;
3723
3724 /* If the pool has been created, verify the tx_sync_lock is not held */
3725 spa_t *spa = dn->dn_objset->os_spa;
3726 dsl_pool_t *dp = spa->spa_dsl_pool;
3727 if (dp != NULL) {
3728 ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
3729 }
3730
3731 ASSERT(blkid != DMU_BONUS_BLKID);
3732 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3733 ASSERT3U(dn->dn_nlevels, >, level);
3734
3735 *dbp = NULL;
3736
3737 /* dbuf_find() returns with db_mtx held */
3738 db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
3739
3740 if (db == NULL) {
3741 blkptr_t *bp = NULL;
3742 int err;
3743
3744 if (fail_uncached)
3745 return (SET_ERROR(ENOENT));
3746
3747 ASSERT3P(parent, ==, NULL);
3748 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
3749 if (fail_sparse) {
3750 if (err == 0 && bp && BP_IS_HOLE(bp))
3751 err = SET_ERROR(ENOENT);
3752 if (err) {
3753 if (parent)
3754 dbuf_rele(parent, NULL);
3755 return (err);
3756 }
3757 }
3758 if (err && err != ENOENT)
3759 return (err);
3760 db = dbuf_create(dn, level, blkid, parent, bp, hv);
3761 }
3762
3763 if (fail_uncached && db->db_state != DB_CACHED) {
3764 mutex_exit(&db->db_mtx);
3765 return (SET_ERROR(ENOENT));
3766 }
3767
3768 if (db->db_buf != NULL) {
3769 arc_buf_access(db->db_buf);
3770 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
3771 }
3772
3773 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
3774
3775 /*
3776 * If this buffer is currently syncing out, and we are
3777 * still referencing it from db_data, we need to make a copy
3778 * of it in case we decide we want to dirty it again in this txg.
3779 */
3780 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
3781 dn->dn_object != DMU_META_DNODE_OBJECT &&
3782 db->db_state == DB_CACHED && db->db_data_pending) {
3783 dbuf_dirty_record_t *dr = db->db_data_pending;
3784 if (dr->dt.dl.dr_data == db->db_buf) {
3785 ASSERT3P(db->db_buf, !=, NULL);
3786 dbuf_hold_copy(dn, db);
3787 }
3788 }
3789
3790 if (multilist_link_active(&db->db_cache_link)) {
3791 ASSERT(zfs_refcount_is_zero(&db->db_holds));
3792 ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
3793 db->db_caching_status == DB_DBUF_METADATA_CACHE);
3794
3795 multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
3796
3797 uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
3798 (void) zfs_refcount_remove_many(
3799 &dbuf_caches[db->db_caching_status].size, size, db);
3800
3801 if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
3802 DBUF_STAT_BUMPDOWN(metadata_cache_count);
3803 } else {
3804 DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
3805 DBUF_STAT_BUMPDOWN(cache_count);
3806 DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
3807 }
3808 db->db_caching_status = DB_NO_CACHE;
3809 }
3810 (void) zfs_refcount_add(&db->db_holds, tag);
3811 DBUF_VERIFY(db);
3812 mutex_exit(&db->db_mtx);
3813
3814 /* NOTE: we can't rele the parent until after we drop the db_mtx */
3815 if (parent)
3816 dbuf_rele(parent, NULL);
3817
3818 ASSERT3P(DB_DNODE(db), ==, dn);
3819 ASSERT3U(db->db_blkid, ==, blkid);
3820 ASSERT3U(db->db_level, ==, level);
3821 *dbp = db;
3822
3823 return (0);
3824 }
3825
3826 dmu_buf_impl_t *
3827 dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)
3828 {
3829 return (dbuf_hold_level(dn, 0, blkid, tag));
3830 }
3831
3832 dmu_buf_impl_t *
3833 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)
3834 {
3835 dmu_buf_impl_t *db;
3836 int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
3837 return (err ? NULL : db);
3838 }
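
/*
 * Example (an illustrative sketch): the common hold/release pattern used
 * by DMU consumers; dbuf_hold() returns NULL on failure:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (db != NULL) {
 *		... use the dbuf ...
 *		dbuf_rele(db, FTAG);
 *	}
 */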
3839
3840 void
3841 dbuf_create_bonus(dnode_t *dn)
3842 {
3843 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
3844
3845 ASSERT(dn->dn_bonus == NULL);
3846 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
3847 dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
3848 }
3849
3850 int
3851 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
3852 {
3853 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3854
3855 if (db->db_blkid != DMU_SPILL_BLKID)
3856 return (SET_ERROR(ENOTSUP));
3857 if (blksz == 0)
3858 blksz = SPA_MINBLOCKSIZE;
3859 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
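/* e.g., a request for 1000 bytes is rounded up to 1024 (2 * SPA_MINBLOCKSIZE) */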
3860 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
3861
3862 dbuf_new_size(db, blksz, tx);
3863
3864 return (0);
3865 }
3866
3867 void
3868 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
3869 {
3870 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
3871 }
3872
3873 #pragma weak dmu_buf_add_ref = dbuf_add_ref
3874 void
3875 dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)
3876 {
3877 int64_t holds = zfs_refcount_add(&db->db_holds, tag);
3878 VERIFY3S(holds, >, 1);
3879 }
3880
3881 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
3882 boolean_t
3883 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
3884 const void *tag)
3885 {
3886 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3887 dmu_buf_impl_t *found_db;
3888 boolean_t result = B_FALSE;
3889
3890 if (blkid == DMU_BONUS_BLKID)
3891 found_db = dbuf_find_bonus(os, obj);
3892 else
3893 found_db = dbuf_find(os, obj, 0, blkid, NULL);
3894
3895 if (found_db != NULL) {
3896 if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
3897 (void) zfs_refcount_add(&db->db_holds, tag);
3898 result = B_TRUE;
3899 }
3900 mutex_exit(&found_db->db_mtx);
3901 }
3902 return (result);
3903 }
3904
3905 /*
3906 * If you call dbuf_rele() you had better not be referencing the dnode handle
3907 * unless you have some other direct or indirect hold on the dnode. (An indirect
3908 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
3909 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
3910 * dnode's parent dbuf evicting its dnode handles.
3911 */
3912 void
3913 dbuf_rele(dmu_buf_impl_t *db, const void *tag)
3914 {
3915 mutex_enter(&db->db_mtx);
3916 dbuf_rele_and_unlock(db, tag, B_FALSE);
3917 }
3918
3919 void
3920 dmu_buf_rele(dmu_buf_t *db, const void *tag)
3921 {
3922 dbuf_rele((dmu_buf_impl_t *)db, tag);
3923 }
3924
3925 /*
3926 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
3927 * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
3928 * argument should be set if we are already in the dbuf-evicting code
3929 * path, in which case we don't want to recursively evict. This allows us to
3930 * avoid deeply nested stacks that would have a call flow similar to this:
3931 *
3932 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
3933 * ^ |
3934 * | |
3935 * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
3936 *
3937 */
3938 void
3939 dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
3940 {
3941 int64_t holds;
3942 uint64_t size;
3943
3944 ASSERT(MUTEX_HELD(&db->db_mtx));
3945 DBUF_VERIFY(db);
3946
3947 /*
3948 * Remove the reference to the dbuf before removing its hold on the
3949 * dnode so we can guarantee in dnode_move() that a referenced bonus
3950 * buffer has a corresponding dnode hold.
3951 */
3952 holds = zfs_refcount_remove(&db->db_holds, tag);
3953 ASSERT(holds >= 0);
3954
3955 /*
3956 * We can't freeze indirects if there is a possibility that they
3957 * may be modified in the current syncing context.
3958 */
3959 if (db->db_buf != NULL &&
3960 holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
3961 arc_buf_freeze(db->db_buf);
3962 }
3963
3964 if (holds == db->db_dirtycnt &&
3965 db->db_level == 0 && db->db_user_immediate_evict)
3966 dbuf_evict_user(db);
3967
3968 if (holds == 0) {
3969 if (db->db_blkid == DMU_BONUS_BLKID) {
3970 dnode_t *dn;
3971 boolean_t evict_dbuf = db->db_pending_evict;
3972
3973 /*
3974 * If the dnode moves here, we cannot cross this
3975 * barrier until the move completes.
3976 */
3977 DB_DNODE_ENTER(db);
3978
3979 dn = DB_DNODE(db);
3980 atomic_dec_32(&dn->dn_dbufs_count);
3981
3982 /*
3983 * Decrementing the dbuf count means that the bonus
3984 * buffer's dnode hold is no longer discounted in
3985 * dnode_move(). The dnode cannot move until after
3986 * the dnode_rele() below.
3987 */
3988 DB_DNODE_EXIT(db);
3989
3990 /*
3991 * Do not reference db after its lock is dropped.
3992 * Another thread may evict it.
3993 */
3994 mutex_exit(&db->db_mtx);
3995
3996 if (evict_dbuf)
3997 dnode_evict_bonus(dn);
3998
3999 dnode_rele(dn, db);
4000 } else if (db->db_buf == NULL) {
4001 /*
4002 * This is a special case: we never associated this
4003 * dbuf with any data allocated from the ARC.
4004 */
4005 ASSERT(db->db_state == DB_UNCACHED ||
4006 db->db_state == DB_NOFILL);
4007 dbuf_destroy(db);
4008 } else if (arc_released(db->db_buf)) {
4009 /*
4010 * This dbuf has anonymous data associated with it.
4011 */
4012 dbuf_destroy(db);
4013 } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
4014 db->db_pending_evict) {
4015 dbuf_destroy(db);
4016 } else if (!multilist_link_active(&db->db_cache_link)) {
4017 ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
4018
4019 dbuf_cached_state_t dcs =
4020 dbuf_include_in_metadata_cache(db) ?
4021 DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
4022 db->db_caching_status = dcs;
4023
4024 multilist_insert(&dbuf_caches[dcs].cache, db);
4025 uint64_t db_size = db->db.db_size +
4026 dmu_buf_user_size(&db->db);
4027 size = zfs_refcount_add_many(
4028 &dbuf_caches[dcs].size, db_size, db);
4029 uint8_t db_level = db->db_level;
4030 mutex_exit(&db->db_mtx);
4031
4032 if (dcs == DB_DBUF_METADATA_CACHE) {
4033 DBUF_STAT_BUMP(metadata_cache_count);
4034 DBUF_STAT_MAX(metadata_cache_size_bytes_max,
4035 size);
4036 } else {
4037 DBUF_STAT_BUMP(cache_count);
4038 DBUF_STAT_MAX(cache_size_bytes_max, size);
4039 DBUF_STAT_BUMP(cache_levels[db_level]);
4040 DBUF_STAT_INCR(cache_levels_bytes[db_level],
4041 db_size);
4042 }
4043
4044 if (dcs == DB_DBUF_CACHE && !evicting)
4045 dbuf_evict_notify(size);
4046 }
4047 } else {
4048 mutex_exit(&db->db_mtx);
4049 }
4050
4051 }
4052
4053 #pragma weak dmu_buf_refcount = dbuf_refcount
4054 uint64_t
4055 dbuf_refcount(dmu_buf_impl_t *db)
4056 {
4057 return (zfs_refcount_count(&db->db_holds));
4058 }
4059
4060 uint64_t
4061 dmu_buf_user_refcount(dmu_buf_t *db_fake)
4062 {
4063 uint64_t holds;
4064 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4065
4066 mutex_enter(&db->db_mtx);
4067 ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
4068 holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
4069 mutex_exit(&db->db_mtx);
4070
4071 return (holds);
4072 }
4073
4074 void *
4075 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
4076 dmu_buf_user_t *new_user)
4077 {
4078 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4079
4080 mutex_enter(&db->db_mtx);
4081 dbuf_verify_user(db, DBVU_NOT_EVICTING);
4082 if (db->db_user == old_user)
4083 db->db_user = new_user;
4084 else
4085 old_user = db->db_user;
4086 dbuf_verify_user(db, DBVU_NOT_EVICTING);
4087 mutex_exit(&db->db_mtx);
4088
4089 return (old_user);
4090 }
4091
4092 void *
4093 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
4094 {
4095 return (dmu_buf_replace_user(db_fake, NULL, user));
4096 }
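
/*
 * Example (an illustrative sketch; my_state_t, my_evict_sync and the ms_*
 * fields are hypothetical names): a consumer embeds a dmu_buf_user_t in
 * its own state, initializes it with dmu_buf_init_user(), and attaches
 * it, coping with another thread winning the race:
 *
 *	my_state_t *ms = kmem_zalloc(sizeof (*ms), KM_SLEEP);
 *	dmu_buf_init_user(&ms->ms_dbu, my_evict_sync, NULL, &ms->ms_db);
 *	if (dmu_buf_set_user(db, &ms->ms_dbu) != NULL) {
 *		kmem_free(ms, sizeof (*ms));	(lost the race)
 *	}
 */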
4097
4098 void *
4099 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
4100 {
4101 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4102
4103 db->db_user_immediate_evict = TRUE;
4104 return (dmu_buf_set_user(db_fake, user));
4105 }
4106
4107 void *
4108 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
4109 {
4110 return (dmu_buf_replace_user(db_fake, user, NULL));
4111 }
4112
4113 void *
4114 dmu_buf_get_user(dmu_buf_t *db_fake)
4115 {
4116 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4117
4118 dbuf_verify_user(db, DBVU_NOT_EVICTING);
4119 return (db->db_user);
4120 }
4121
4122 uint64_t
4123 dmu_buf_user_size(dmu_buf_t *db_fake)
4124 {
4125 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4126 if (db->db_user == NULL)
4127 return (0);
4128 return (atomic_load_64(&db->db_user->dbu_size));
4129 }
4130
4131 void
4132 dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
4133 {
4134 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4135 ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
4136 ASSERT3P(db->db_user, !=, NULL);
4137 ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
4138 atomic_add_64(&db->db_user->dbu_size, nadd);
4139 }
4140
4141 void
4142 dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
4143 {
4144 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4145 ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
4146 ASSERT3P(db->db_user, !=, NULL);
4147 ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
4148 atomic_sub_64(&db->db_user->dbu_size, nsub);
4149 }
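
/*
 * The dbu_size maintained by the two functions above is charged against
 * the dbuf caches: when a dbuf with a user is inserted into or removed
 * from a cache, the accounted size is db->db.db_size plus
 * dmu_buf_user_size(&db->db) (see dbuf_hold_impl() and
 * dbuf_rele_and_unlock()).
 */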
4150
4151 void
4152 dmu_buf_user_evict_wait(void)
4153 {
4154 taskq_wait(dbu_evict_taskq);
4155 }
4156
4157 blkptr_t *
4158 dmu_buf_get_blkptr(dmu_buf_t *db)
4159 {
4160 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
4161 return (dbi->db_blkptr);
4162 }
4163
4164 objset_t *
4165 dmu_buf_get_objset(dmu_buf_t *db)
4166 {
4167 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
4168 return (dbi->db_objset);
4169 }
4170
4171 static void
4172 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
4173 {
4174 /* ASSERT(dmu_tx_is_syncing(tx)) */
4175 ASSERT(MUTEX_HELD(&db->db_mtx));
4176
4177 if (db->db_blkptr != NULL)
4178 return;
4179
4180 if (db->db_blkid == DMU_SPILL_BLKID) {
4181 db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
4182 BP_ZERO(db->db_blkptr);
4183 return;
4184 }
4185 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
4186 /*
4187 * This buffer was allocated at a time when there were
4188 * no available blkptrs from the dnode, or it was
4189 * inappropriate to hook it in (i.e., nlevels mismatch).
4190 */
4191 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
4192 ASSERT(db->db_parent == NULL);
4193 db->db_parent = dn->dn_dbuf;
4194 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
4195 DBUF_VERIFY(db);
4196 } else {
4197 dmu_buf_impl_t *parent = db->db_parent;
4198 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
4199
4200 ASSERT(dn->dn_phys->dn_nlevels > 1);
4201 if (parent == NULL) {
4202 mutex_exit(&db->db_mtx);
4203 rw_enter(&dn->dn_struct_rwlock, RW_READER);
4204 parent = dbuf_hold_level(dn, db->db_level + 1,
4205 db->db_blkid >> epbs, db);
4206 rw_exit(&dn->dn_struct_rwlock);
4207 mutex_enter(&db->db_mtx);
4208 db->db_parent = parent;
4209 }
4210 db->db_blkptr = (blkptr_t *)parent->db.db_data +
4211 (db->db_blkid & ((1ULL << epbs) - 1));
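/*
 * For example, with 128K indirect blocks (dn_indblkshift == 17, so
 * epbs == 10), a level-0 dbuf with blkid 3000 hangs off the level-1
 * indirect with blkid 3000 >> 10 == 2, at slot 3000 & 1023 == 952.
 */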
4212 DBUF_VERIFY(db);
4213 }
4214 }
4215
4216 static void
4217 dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4218 {
4219 dmu_buf_impl_t *db = dr->dr_dbuf;
4220 void *data = dr->dt.dl.dr_data;
4221
4222 ASSERT0(db->db_level);
4223 ASSERT(MUTEX_HELD(&db->db_mtx));
4224 ASSERT(db->db_blkid == DMU_BONUS_BLKID);
4225 ASSERT(data != NULL);
4226
4227 dnode_t *dn = dr->dr_dnode;
4228 ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
4229 DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
4230 memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));
4231
4232 dbuf_sync_leaf_verify_bonus_dnode(dr);
4233
4234 dbuf_undirty_bonus(dr);
4235 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
4236 }
4237
4238 /*
4239 * When syncing out a block of dnodes, adjust the block to deal with
4240 * encryption. Normally, we make sure the block is decrypted before writing
4241 * it. If we have crypt params, then we are writing a raw (encrypted) block,
4242 * from a raw receive. In this case, set the ARC buf's crypt params so
4243 * that the BP will be filled with the correct byteorder, salt, iv, and mac.
4244 */
4245 static void
4246 dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
4247 {
4248 int err;
4249 dmu_buf_impl_t *db = dr->dr_dbuf;
4250
4251 ASSERT(MUTEX_HELD(&db->db_mtx));
4252 ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
4253 ASSERT3U(db->db_level, ==, 0);
4254
4255 if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
4256 zbookmark_phys_t zb;
4257
4258 /*
4259 * Unfortunately, there is currently no mechanism for
4260 * syncing context to handle decryption errors. An error
4261 * here is only possible if an attacker maliciously
4262 * changed a dnode block and updated the associated
4263 * checksums going up the block tree.
4264 */
4265 SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
4266 db->db.db_object, db->db_level, db->db_blkid);
4267 err = arc_untransform(db->db_buf, db->db_objset->os_spa,
4268 &zb, B_TRUE);
4269 if (err)
4270 panic("Invalid dnode block MAC");
4271 } else if (dr->dt.dl.dr_has_raw_params) {
4272 (void) arc_release(dr->dt.dl.dr_data, db);
4273 arc_convert_to_raw(dr->dt.dl.dr_data,
4274 dmu_objset_id(db->db_objset),
4275 dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
4276 dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
4277 }
4278 }
4279
4280 /*
4281 * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
4282 * is critical the we not allow the compiler to inline this function in to
4283 * dbuf_sync_list() thereby drastically bloating the stack usage.
4284 */
4285 noinline static void
4286 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4287 {
4288 dmu_buf_impl_t *db = dr->dr_dbuf;
4289 dnode_t *dn = dr->dr_dnode;
4290
4291 ASSERT(dmu_tx_is_syncing(tx));
4292
4293 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
4294
4295 mutex_enter(&db->db_mtx);
4296
4297 ASSERT(db->db_level > 0);
4298 DBUF_VERIFY(db);
4299
4300 /* Read the block if it hasn't been read yet. */
4301 if (db->db_buf == NULL) {
4302 mutex_exit(&db->db_mtx);
4303 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
4304 mutex_enter(&db->db_mtx);
4305 }
4306 ASSERT3U(db->db_state, ==, DB_CACHED);
4307 ASSERT(db->db_buf != NULL);
4308
4309 /* Indirect block size must match what the dnode thinks it is. */
4310 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
4311 dbuf_check_blkptr(dn, db);
4312
4313 /* Provide the pending dirty record to child dbufs */
4314 db->db_data_pending = dr;
4315
4316 mutex_exit(&db->db_mtx);
4317
4318 dbuf_write(dr, db->db_buf, tx);
4319
4320 zio_t *zio = dr->dr_zio;
4321 mutex_enter(&dr->dt.di.dr_mtx);
4322 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
4323 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
4324 mutex_exit(&dr->dt.di.dr_mtx);
4325 zio_nowait(zio);
4326 }
4327
4328 /*
4329 * Verify that the size of the data in our bonus buffer does not exceed
4330 * its recorded size.
4331 *
4332 * The purpose of this verification is to catch any cases in development
4333 * where the size of a phys structure (i.e space_map_phys_t) grows and,
4334 * due to incorrect feature management, older pools expect to read more
4335 * data even though they didn't actually write it to begin with.
4336 *
4337 * For example, this would catch an error in the feature logic where we
4338 * open an older pool and we expect to write the space map histogram of
4339 * a space map with size SPACE_MAP_SIZE_V0.
4340 */
4341 static void
4342 dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
4343 {
4344 #ifdef ZFS_DEBUG
4345 dnode_t *dn = dr->dr_dnode;
4346
4347 /*
4348 * Encrypted bonus buffers can have data past their bonuslen.
4349 * Skip the verification of these blocks.
4350 */
4351 if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
4352 return;
4353
4354 uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
4355 uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
4356 ASSERT3U(bonuslen, <=, maxbonuslen);
4357
4358 arc_buf_t *datap = dr->dt.dl.dr_data;
4359 char *datap_end = ((char *)datap) + bonuslen;
4360 char *datap_max = ((char *)datap) + maxbonuslen;
4361
4362 /* ensure that everything is zero after our data */
4363 for (; datap_end < datap_max; datap_end++)
4364 ASSERT(*datap_end == 0);
4365 #endif
4366 }
4367
4368 static blkptr_t *
4369 dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
4370 {
4371 /* This must be a lightweight dirty record. */
4372 ASSERT3P(dr->dr_dbuf, ==, NULL);
4373 dnode_t *dn = dr->dr_dnode;
4374
4375 if (dn->dn_phys->dn_nlevels == 1) {
4376 VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
4377 return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
4378 } else {
4379 dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
4380 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
4381 VERIFY3U(parent_db->db_level, ==, 1);
4382 VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
4383 VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
4384 blkptr_t *bp = parent_db->db.db_data;
4385 return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
4386 }
4387 }
4388
4389 static void
4390 dbuf_lightweight_ready(zio_t *zio)
4391 {
4392 dbuf_dirty_record_t *dr = zio->io_private;
4393 blkptr_t *bp = zio->io_bp;
4394
4395 if (zio->io_error != 0)
4396 return;
4397
4398 dnode_t *dn = dr->dr_dnode;
4399
4400 blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
4401 spa_t *spa = dmu_objset_spa(dn->dn_objset);
4402 int64_t delta = bp_get_dsize_sync(spa, bp) -
4403 bp_get_dsize_sync(spa, bp_orig);
4404 dnode_diduse_space(dn, delta);
4405
4406 uint64_t blkid = dr->dt.dll.dr_blkid;
4407 mutex_enter(&dn->dn_mtx);
4408 if (blkid > dn->dn_phys->dn_maxblkid) {
4409 ASSERT0(dn->dn_objset->os_raw_receive);
4410 dn->dn_phys->dn_maxblkid = blkid;
4411 }
4412 mutex_exit(&dn->dn_mtx);
4413
4414 if (!BP_IS_EMBEDDED(bp)) {
4415 uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
4416 BP_SET_FILL(bp, fill);
4417 }
4418
4419 dmu_buf_impl_t *parent_db;
4420 EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4421 if (dr->dr_parent == NULL) {
4422 parent_db = dn->dn_dbuf;
4423 } else {
4424 parent_db = dr->dr_parent->dr_dbuf;
4425 }
4426 rw_enter(&parent_db->db_rwlock, RW_WRITER);
4427 *bp_orig = *bp;
4428 rw_exit(&parent_db->db_rwlock);
4429 }
4430
4431 static void
4432 dbuf_lightweight_done(zio_t *zio)
4433 {
4434 dbuf_dirty_record_t *dr = zio->io_private;
4435
4436 VERIFY0(zio->io_error);
4437
4438 objset_t *os = dr->dr_dnode->dn_objset;
4439 dmu_tx_t *tx = os->os_synctx;
4440
4441 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
4442 ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
4443 } else {
4444 dsl_dataset_t *ds = os->os_dsl_dataset;
4445 (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
4446 dsl_dataset_block_born(ds, zio->io_bp, tx);
4447 }
4448
4449 dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
4450 zio->io_txg);
4451
4452 abd_free(dr->dt.dll.dr_abd);
4453 kmem_free(dr, sizeof (*dr));
4454 }
4455
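/*
 * Sync a lightweight dirty record: one with no dbuf (dr_dbuf == NULL, see
 * dbuf_lightweight_bp() above) whose data lives in dr->dt.dll.dr_abd and
 * is written directly, without passing through the ARC; the record and
 * its abd are freed in dbuf_lightweight_done().
 */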
4456 noinline static void
4457 dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4458 {
4459 dnode_t *dn = dr->dr_dnode;
4460 zio_t *pio;
4461 if (dn->dn_phys->dn_nlevels == 1) {
4462 pio = dn->dn_zio;
4463 } else {
4464 pio = dr->dr_parent->dr_zio;
4465 }
4466
4467 zbookmark_phys_t zb = {
4468 .zb_objset = dmu_objset_id(dn->dn_objset),
4469 .zb_object = dn->dn_object,
4470 .zb_level = 0,
4471 .zb_blkid = dr->dt.dll.dr_blkid,
4472 };
4473
4474 /*
4475 * See comment in dbuf_write(). This is so that zio->io_bp_orig
4476 * will have the old BP in dbuf_lightweight_done().
4477 */
4478 dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
4479
4480 dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
4481 dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
4482 dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
4483 &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
4484 dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
4485 ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
4486
4487 zio_nowait(dr->dr_zio);
4488 }
4489
4490 /*
4491 * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
4492 * critical the we not allow the compiler to inline this function in to
4493 * dbuf_sync_list() thereby drastically bloating the stack usage.
4494 */
4495 noinline static void
4496 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4497 {
4498 arc_buf_t **datap = &dr->dt.dl.dr_data;
4499 dmu_buf_impl_t *db = dr->dr_dbuf;
4500 dnode_t *dn = dr->dr_dnode;
4501 objset_t *os;
4502 uint64_t txg = tx->tx_txg;
4503
4504 ASSERT(dmu_tx_is_syncing(tx));
4505
4506 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
4507
4508 mutex_enter(&db->db_mtx);
4509 /*
4510 * To be synced, we must be dirtied. But we
4511 * might have been freed after the dirty.
4512 */
4513 if (db->db_state == DB_UNCACHED) {
4514 /* This buffer has been freed since it was dirtied */
4515 ASSERT(db->db.db_data == NULL);
4516 } else if (db->db_state == DB_FILL) {
4517 /* This buffer was freed and is now being re-filled */
4518 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
4519 } else if (db->db_state == DB_READ) {
4520 /*
4521 * This buffer has a clone we need to write, and an in-flight
4522 * read on the BP we're about to clone. It's safe to issue the
4523 * write here because the read has already been issued and the
4524 * contents won't change.
4525 */
4526 ASSERT(dr->dt.dl.dr_brtwrite &&
4527 dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
4528 } else {
4529 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
4530 }
4531 DBUF_VERIFY(db);
4532
4533 if (db->db_blkid == DMU_SPILL_BLKID) {
4534 mutex_enter(&dn->dn_mtx);
4535 if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
4536 /*
4537 * In the previous transaction group, the bonus buffer
4538 * was entirely used to store the attributes for the
4539 * dnode which overrode the dn_spill field. However,
4540 * when adding more attributes to the file a spill
4541 * block was required to hold the extra attributes.
4542 *
4543 * Make sure to clear the garbage left in the dn_spill
4544 * field from the previous attributes in the bonus
4545 * buffer. Otherwise, after writing out the spill
4546 * block to the newly allocated dva, it will free
4547 * the old block pointed to by the invalid dn_spill.
4548 */
4549 db->db_blkptr = NULL;
4550 }
4551 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
4552 mutex_exit(&dn->dn_mtx);
4553 }
4554
4555 /*
4556 * If this is a bonus buffer, simply copy the bonus data into the
4557 * dnode. It will be written out when the dnode is synced (and it
4558 * will be synced, since it must have been dirty for dbuf_sync to
4559 * be called).
4560 */
4561 if (db->db_blkid == DMU_BONUS_BLKID) {
4562 ASSERT(dr->dr_dbuf == db);
4563 dbuf_sync_bonus(dr, tx);
4564 return;
4565 }
4566
4567 os = dn->dn_objset;
4568
4569 /*
4570 * This function may have dropped the db_mtx lock allowing a dmu_sync
4571 * operation to sneak in. As a result, we need to ensure that we
4572 * don't check the dr_override_state until we have returned from
4573 * dbuf_check_blkptr.
4574 */
4575 dbuf_check_blkptr(dn, db);
4576
4577 /*
4578 * If this buffer is in the middle of an immediate write,
4579 * wait for the synchronous IO to complete.
4580 */
4581 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
4582 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
4583 cv_wait(&db->db_changed, &db->db_mtx);
4584 }
4585
4586 /*
4587 * If this is a dnode block, ensure it is appropriately encrypted
4588 * or decrypted, depending on what we are writing to it this txg.
4589 */
4590 if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
4591 dbuf_prepare_encrypted_dnode_leaf(dr);
4592
4593 if (*datap != NULL && *datap == db->db_buf &&
4594 dn->dn_object != DMU_META_DNODE_OBJECT &&
4595 zfs_refcount_count(&db->db_holds) > 1 &&
4596 dr->dt.dl.dr_override_state != DR_OVERRIDDEN) {
4597 /*
4598 * If this buffer is currently "in use" (i.e., there
4599 * are active holds and db_data still references it),
4600 * then make a copy before we start the write so that
4601 * any modifications from the open txg will not leak
4602 * into this write.
4603 *
4604 * NOTE: this copy does not need to be made for
4605 * objects only modified in the syncing context (e.g.
4606 * DMU_OT_DNODE blocks).
4607 */
4608 int psize = arc_buf_size(*datap);
4609 int lsize = arc_buf_lsize(*datap);
4610 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
4611 enum zio_compress compress_type = arc_get_compression(*datap);
4612 uint8_t complevel = arc_get_complevel(*datap);
4613
4614 if (arc_is_encrypted(*datap)) {
4615 boolean_t byteorder;
4616 uint8_t salt[ZIO_DATA_SALT_LEN];
4617 uint8_t iv[ZIO_DATA_IV_LEN];
4618 uint8_t mac[ZIO_DATA_MAC_LEN];
4619
4620 arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
4621 *datap = arc_alloc_raw_buf(os->os_spa, db,
4622 dmu_objset_id(os), byteorder, salt, iv, mac,
4623 dn->dn_type, psize, lsize, compress_type,
4624 complevel);
4625 } else if (compress_type != ZIO_COMPRESS_OFF) {
4626 ASSERT3U(type, ==, ARC_BUFC_DATA);
4627 *datap = arc_alloc_compressed_buf(os->os_spa, db,
4628 psize, lsize, compress_type, complevel);
4629 } else {
4630 *datap = arc_alloc_buf(os->os_spa, db, type, psize);
4631 }
4632 memcpy((*datap)->b_data, db->db.db_data, psize);
4633 }
4634 db->db_data_pending = dr;
4635
4636 mutex_exit(&db->db_mtx);
4637
4638 dbuf_write(dr, *datap, tx);
4639
4640 ASSERT(!list_link_active(&dr->dr_dirty_node));
4641 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
4642 list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
4643 } else {
4644 zio_nowait(dr->dr_zio);
4645 }
4646 }
4647
4648 /*
4649 * Syncs out a range of dirty records for indirect or leaf dbufs. May be
4650 * called recursively from dbuf_sync_indirect().
4651 */
4652 void
4653 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
4654 {
4655 dbuf_dirty_record_t *dr;
4656
4657 while ((dr = list_head(list))) {
4658 if (dr->dr_zio != NULL) {
4659 /*
4660 * If we find an already initialized zio then we
4661 * are processing the meta-dnode, and we have finished.
4662 * The dbufs for all dnodes are put back on the list
4663 * during processing, so that we can zio_wait()
4664 * these IOs after initiating all child IOs.
4665 */
4666 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
4667 DMU_META_DNODE_OBJECT);
4668 break;
4669 }
4670 list_remove(list, dr);
4671 if (dr->dr_dbuf == NULL) {
4672 dbuf_sync_lightweight(dr, tx);
4673 } else {
4674 if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
4675 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
4676 VERIFY3U(dr->dr_dbuf->db_level, ==, level);
4677 }
4678 if (dr->dr_dbuf->db_level > 0)
4679 dbuf_sync_indirect(dr, tx);
4680 else
4681 dbuf_sync_leaf(dr, tx);
4682 }
4683 }
4684 }
4685
4686 static void
4687 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
4688 {
4689 (void) buf;
4690 dmu_buf_impl_t *db = vdb;
4691 dnode_t *dn;
4692 blkptr_t *bp = zio->io_bp;
4693 blkptr_t *bp_orig = &zio->io_bp_orig;
4694 spa_t *spa = zio->io_spa;
4695 int64_t delta;
4696 uint64_t fill = 0;
4697 int i;
4698
4699 ASSERT3P(db->db_blkptr, !=, NULL);
4700 ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
4701
4702 DB_DNODE_ENTER(db);
4703 dn = DB_DNODE(db);
4704 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
4705 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
4706 zio->io_prev_space_delta = delta;
4707
4708 if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
4709 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
4710 BP_GET_TYPE(bp) == dn->dn_type) ||
4711 (db->db_blkid == DMU_SPILL_BLKID &&
4712 BP_GET_TYPE(bp) == dn->dn_bonustype) ||
4713 BP_IS_EMBEDDED(bp));
4714 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
4715 }
4716
4717 mutex_enter(&db->db_mtx);
4718
4719 #ifdef ZFS_DEBUG
4720 if (db->db_blkid == DMU_SPILL_BLKID) {
4721 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
4722 ASSERT(!(BP_IS_HOLE(bp)) &&
4723 db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
4724 }
4725 #endif
4726
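/*
 * Compute the new BP's fill count: for a block of dnodes it is the
 * number of allocated dnodes, for any other level-0 block it is 0 or 1
 * (hole or not), and for an indirect block it is the sum of the fill
 * counts of its children.
 */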
4727 if (db->db_level == 0) {
4728 mutex_enter(&dn->dn_mtx);
4729 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
4730 db->db_blkid != DMU_SPILL_BLKID) {
4731 ASSERT0(db->db_objset->os_raw_receive);
4732 dn->dn_phys->dn_maxblkid = db->db_blkid;
4733 }
4734 mutex_exit(&dn->dn_mtx);
4735
4736 if (dn->dn_type == DMU_OT_DNODE) {
4737 i = 0;
4738 while (i < db->db.db_size) {
4739 dnode_phys_t *dnp =
4740 (void *)(((char *)db->db.db_data) + i);
4741
4742 i += DNODE_MIN_SIZE;
4743 if (dnp->dn_type != DMU_OT_NONE) {
4744 fill++;
4745 for (int j = 0; j < dnp->dn_nblkptr;
4746 j++) {
4747 (void) zfs_blkptr_verify(spa,
4748 &dnp->dn_blkptr[j],
4749 BLK_CONFIG_SKIP,
4750 BLK_VERIFY_HALT);
4751 }
4752 if (dnp->dn_flags &
4753 DNODE_FLAG_SPILL_BLKPTR) {
4754 (void) zfs_blkptr_verify(spa,
4755 DN_SPILL_BLKPTR(dnp),
4756 BLK_CONFIG_SKIP,
4757 BLK_VERIFY_HALT);
4758 }
4759 i += dnp->dn_extra_slots *
4760 DNODE_MIN_SIZE;
4761 }
4762 }
4763 } else {
4764 if (BP_IS_HOLE(bp)) {
4765 fill = 0;
4766 } else {
4767 fill = 1;
4768 }
4769 }
4770 } else {
4771 blkptr_t *ibp = db->db.db_data;
4772 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
4773 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
4774 if (BP_IS_HOLE(ibp))
4775 continue;
4776 (void) zfs_blkptr_verify(spa, ibp,
4777 BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
4778 fill += BP_GET_FILL(ibp);
4779 }
4780 }
4781 DB_DNODE_EXIT(db);
4782
4783 if (!BP_IS_EMBEDDED(bp))
4784 BP_SET_FILL(bp, fill);
4785
4786 mutex_exit(&db->db_mtx);
4787
4788 db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
4789 *db->db_blkptr = *bp;
4790 dmu_buf_unlock_parent(db, dblt, FTAG);
4791 }
4792
4793 /*
4794 * This function gets called just prior to running through the compression
4795 * stage of the zio pipeline. If we're an indirect block composed of only
4796 * holes, then we want this indirect to be compressed away to a hole. In
4797 * order to do that we must zero out any information about the holes that
4798 * this indirect points to before we try to compress it.
4799 */
4800 static void
4801 dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
4802 {
4803 (void) zio, (void) buf;
4804 dmu_buf_impl_t *db = vdb;
4805 dnode_t *dn;
4806 blkptr_t *bp;
4807 unsigned int epbs, i;
4808
4809 ASSERT3U(db->db_level, >, 0);
4810 DB_DNODE_ENTER(db);
4811 dn = DB_DNODE(db);
4812 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
4813 ASSERT3U(epbs, <, 31);
4814
4815 /* Determine if all our children are holes */
4816 for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
4817 if (!BP_IS_HOLE(bp))
4818 break;
4819 }
4820
4821 /*
4822 * If all the children are holes, then zero them all out so that
4823 * we may get compressed away.
4824 */
4825 if (i == 1ULL << epbs) {
4826 /*
4827 * We only found holes. Grab the rwlock to prevent
4828 * anybody from reading the blocks we're about to
4829 * zero out.
4830 */
4831 rw_enter(&db->db_rwlock, RW_WRITER);
4832 memset(db->db.db_data, 0, db->db.db_size);
4833 rw_exit(&db->db_rwlock);
4834 }
4835 DB_DNODE_EXIT(db);
4836 }
4837
4838 static void
4839 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
4840 {
4841 (void) buf;
4842 dmu_buf_impl_t *db = vdb;
4843 blkptr_t *bp_orig = &zio->io_bp_orig;
4844 blkptr_t *bp = db->db_blkptr;
4845 objset_t *os = db->db_objset;
4846 dmu_tx_t *tx = os->os_synctx;
4847
4848 ASSERT0(zio->io_error);
4849 ASSERT(db->db_blkptr == bp);
4850
4851 /*
4852 * For nopwrites and rewrites we ensure that the bp matches our
4853 * original and bypass all the accounting.
4854 */
4855 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
4856 ASSERT(BP_EQUAL(bp, bp_orig));
4857 } else {
4858 dsl_dataset_t *ds = os->os_dsl_dataset;
4859 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
4860 dsl_dataset_block_born(ds, bp, tx);
4861 }
4862
4863 mutex_enter(&db->db_mtx);
4864
4865 DBUF_VERIFY(db);
4866
4867 dbuf_dirty_record_t *dr = db->db_data_pending;
4868 dnode_t *dn = dr->dr_dnode;
4869 ASSERT(!list_link_active(&dr->dr_dirty_node));
4870 ASSERT(dr->dr_dbuf == db);
4871 ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
4872 list_remove(&db->db_dirty_records, dr);
4873
4874 #ifdef ZFS_DEBUG
4875 if (db->db_blkid == DMU_SPILL_BLKID) {
4876 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
4877 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
4878 db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
4879 }
4880 #endif
4881
4882 if (db->db_level == 0) {
4883 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
4884 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
4885 if (dr->dt.dl.dr_data != NULL &&
4886 dr->dt.dl.dr_data != db->db_buf) {
4887 arc_buf_destroy(dr->dt.dl.dr_data, db);
4888 }
4889 } else {
4890 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
4891 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
4892 if (!BP_IS_HOLE(db->db_blkptr)) {
4893 int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
4894 SPA_BLKPTRSHIFT;
4895 ASSERT3U(db->db_blkid, <=,
4896 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
4897 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
4898 db->db.db_size);
4899 }
4900 mutex_destroy(&dr->dt.di.dr_mtx);
4901 list_destroy(&dr->dt.di.dr_children);
4902 }
4903
4904 cv_broadcast(&db->db_changed);
4905 ASSERT(db->db_dirtycnt > 0);
4906 db->db_dirtycnt -= 1;
4907 db->db_data_pending = NULL;
4908 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
4909
4910 dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
4911 zio->io_txg);
4912
4913 kmem_free(dr, sizeof (dbuf_dirty_record_t));
4914 }
4915
4916 static void
4917 dbuf_write_nofill_ready(zio_t *zio)
4918 {
4919 dbuf_write_ready(zio, NULL, zio->io_private);
4920 }
4921
4922 static void
4923 dbuf_write_nofill_done(zio_t *zio)
4924 {
4925 dbuf_write_done(zio, NULL, zio->io_private);
4926 }
4927
4928 static void
4929 dbuf_write_override_ready(zio_t *zio)
4930 {
4931 dbuf_dirty_record_t *dr = zio->io_private;
4932 dmu_buf_impl_t *db = dr->dr_dbuf;
4933
4934 dbuf_write_ready(zio, NULL, db);
4935 }
4936
4937 static void
4938 dbuf_write_override_done(zio_t *zio)
4939 {
4940 dbuf_dirty_record_t *dr = zio->io_private;
4941 dmu_buf_impl_t *db = dr->dr_dbuf;
4942 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
4943
4944 mutex_enter(&db->db_mtx);
4945 if (!BP_EQUAL(zio->io_bp, obp)) {
4946 if (!BP_IS_HOLE(obp))
4947 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
4948 arc_release(dr->dt.dl.dr_data, db);
4949 }
4950 mutex_exit(&db->db_mtx);
4951
4952 dbuf_write_done(zio, NULL, db);
4953
4954 if (zio->io_abd != NULL)
4955 abd_free(zio->io_abd);
4956 }
4957
4958 typedef struct dbuf_remap_impl_callback_arg {
4959 objset_t *drica_os;
4960 uint64_t drica_blk_birth;
4961 dmu_tx_t *drica_tx;
4962 } dbuf_remap_impl_callback_arg_t;
4963
4964 static void
4965 dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
4966 void *arg)
4967 {
4968 dbuf_remap_impl_callback_arg_t *drica = arg;
4969 objset_t *os = drica->drica_os;
4970 spa_t *spa = dmu_objset_spa(os);
4971 dmu_tx_t *tx = drica->drica_tx;
4972
4973 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4974
4975 if (os == spa_meta_objset(spa)) {
4976 spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
4977 } else {
4978 dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
4979 size, drica->drica_blk_birth, tx);
4980 }
4981 }
4982
4983 static void
4984 dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
4985 {
4986 blkptr_t bp_copy = *bp;
4987 spa_t *spa = dmu_objset_spa(dn->dn_objset);
4988 dbuf_remap_impl_callback_arg_t drica;
4989
4990 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4991
4992 drica.drica_os = dn->dn_objset;
4993 drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
4994 drica.drica_tx = tx;
4995 if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
4996 &drica)) {
4997 /*
4998 * If the blkptr being remapped is tracked by a livelist,
4999 * then we need to make sure the livelist reflects the update.
5000 * First, cancel out the old blkptr by appending a 'FREE'
5001 * entry. Next, add an 'ALLOC' to track the new version. This
5002 * way we avoid trying to free an inaccurate blkptr at delete.
5003 * Note that embedded blkptrs are not tracked in livelists.
5004 */
5005 if (dn->dn_objset != spa_meta_objset(spa)) {
5006 dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
5007 if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
5008 BP_GET_LOGICAL_BIRTH(bp) >
5009 ds->ds_dir->dd_origin_txg) {
5010 ASSERT(!BP_IS_EMBEDDED(bp));
5011 ASSERT(dsl_dir_is_clone(ds->ds_dir));
5012 ASSERT(spa_feature_is_enabled(spa,
5013 SPA_FEATURE_LIVELIST));
5014 bplist_append(&ds->ds_dir->dd_pending_frees,
5015 bp);
5016 bplist_append(&ds->ds_dir->dd_pending_allocs,
5017 &bp_copy);
5018 }
5019 }
5020
5021 /*
5022 * The db_rwlock prevents dbuf_read_impl() from
5023 * dereferencing the BP while we are changing it. To
5024 * avoid lock contention, only grab it when we are actually
5025 * changing the BP.
5026 */
5027 if (rw != NULL)
5028 rw_enter(rw, RW_WRITER);
5029 *bp = bp_copy;
5030 if (rw != NULL)
5031 rw_exit(rw);
5032 }
5033 }
5034
5035 /*
5036 * Remap any existing BP's to concrete vdevs, if possible.
5037 */
5038 static void
5039 dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
5040 {
5041 spa_t *spa = dmu_objset_spa(db->db_objset);
5042 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
5043
5044 if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
5045 return;
5046
5047 if (db->db_level > 0) {
5048 blkptr_t *bp = db->db.db_data;
5049 for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
5050 dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
5051 }
5052 } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
5053 dnode_phys_t *dnp = db->db.db_data;
5054 ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
5055 DMU_OT_DNODE);
5056 for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
5057 i += dnp[i].dn_extra_slots + 1) {
5058 for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
5059 krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
5060 &dn->dn_dbuf->db_rwlock);
5061 dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
5062 tx);
5063 }
5064 }
5065 }
5066 }
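
/*
 * For scale: blkptrs are 1 << SPA_BLKPTRSHIFT (128) bytes, so the loop
 * above visits 131072 >> 7 == 1024 BPs for a 128K indirect block.  The
 * meta-dnode case steps through 1 << DNODE_SHIFT (512) byte dnode slots,
 * skipping the extra slots consumed by large dnodes.
 */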

/*
 * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
 * The caller is responsible for issuing zio_wait() or zio_nowait() on
 * dr->dr_zio.
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = dr->dr_dnode;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *pio; /* parent I/O */
	int wp_flag = 0;

	ASSERT(dmu_tx_is_syncing(tx));

	os = dn->dn_objset;

	if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
		/*
		 * Private object buffers are released here rather than in
		 * dbuf_dirty() since they are only modified in the syncing
		 * context and we don't want the overhead of making multiple
		 * copies of the data.
		 */
		if (BP_IS_HOLE(db->db_blkptr))
			arc_buf_thaw(data);
		else
			dbuf_release_bp(db);
		dbuf_remap(dn, db, tx);
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		pio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		pio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
	ASSERT(pio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (data == NULL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);

	/*
	 * We copy the blkptr now (rather than when we instantiate the dirty
	 * record), because its value can change between open context and
	 * syncing context. We do not need to hold dn_struct_rwlock to read
	 * db_blkptr because we are in syncing context.
	 */
	dr->dr_bp_copy = *db->db_blkptr;

	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync() or dmu_buf_write_embedded()).
		 */
		abd_t *contents = (data != NULL) ?
		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;

		dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
		    contents, db->db.db_size, db->db.db_size, &zp,
		    dbuf_write_override_ready, NULL,
		    dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
		    dr->dt.dl.dr_brtwrite);
		mutex_exit(&db->db_mtx);
	} else if (data == NULL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(pio, os->os_spa, txg,
		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL,
		    dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));

		/*
		 * For indirect blocks, we want to set up the children
		 * ready callback so that we can properly handle an indirect
		 * block that only contains holes.
		 */
		arc_write_done_func_t *children_ready_cb = NULL;
		if (db->db_level != 0)
			children_ready_cb = dbuf_write_children_ready;

		dr->dr_zio = arc_write(pio, os->os_spa, txg,
		    &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
		    dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
		    children_ready_cb, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
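
/*
 * Simplified caller sketch (the real callers are the dbuf_sync_*()
 * routines in this file; details elided):
 *
 *	dbuf_write(dr, *datap, tx);	// populates dr->dr_zio
 *	zio_nowait(dr->dr_zio);		// caller issues the write
 */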

EXPORT_SYMBOL(dbuf_find);
EXPORT_SYMBOL(dbuf_is_metadata);
EXPORT_SYMBOL(dbuf_destroy);
EXPORT_SYMBOL(dbuf_loan_arcbuf);
EXPORT_SYMBOL(dbuf_whichblock);
EXPORT_SYMBOL(dbuf_read);
EXPORT_SYMBOL(dbuf_unoverride);
EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_clone);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
EXPORT_SYMBOL(dmu_buf_rele);
EXPORT_SYMBOL(dbuf_assign_arcbuf);
EXPORT_SYMBOL(dbuf_prefetch);
EXPORT_SYMBOL(dbuf_hold_impl);
EXPORT_SYMBOL(dbuf_hold);
EXPORT_SYMBOL(dbuf_hold_level);
EXPORT_SYMBOL(dbuf_create_bonus);
EXPORT_SYMBOL(dbuf_spill_set_blksz);
EXPORT_SYMBOL(dbuf_rm_spill);
EXPORT_SYMBOL(dbuf_add_ref);
EXPORT_SYMBOL(dbuf_rele);
EXPORT_SYMBOL(dbuf_rele_and_unlock);
EXPORT_SYMBOL(dbuf_refcount);
EXPORT_SYMBOL(dbuf_sync_list);
EXPORT_SYMBOL(dmu_buf_set_user);
EXPORT_SYMBOL(dmu_buf_set_user_ie);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_get_blkptr);

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
	"Maximum size in bytes of the dbuf cache.");

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
	"Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
	"Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
	"Maximum size in bytes of dbuf metadata cache.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
	"Set size of dbuf cache to log2 fraction of arc size.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
	"Set size of dbuf metadata cache to log2 fraction of arc size.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
	"Set size of dbuf cache mutex array as log2 shift.");

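/*
 * Usage example (Linux; illustrative values): the tunables above appear
 * as zfs module parameters, e.g.
 *
 *	echo 134217728 > /sys/module/zfs/parameters/dbuf_cache_max_bytes
 *
 * or persistently in /etc/modprobe.d/zfs.conf:
 *
 *	options zfs dbuf_cache_max_bytes=134217728
 */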