/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, Delphix. All rights reserved.
 * Copyright (c) 2013, Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2020, George Amanakis. All rights reserved.
 */

#ifndef	_SYS_ARC_IMPL_H
#define	_SYS_ARC_IMPL_H

#include <sys/arc.h>
#include <sys/zio_crypt.h>
#include <sys/zthr.h>
#include <sys/aggsum.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to a buffer, it is
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.
Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

/*
 * Per-state bookkeeping.  The two arrays are indexed by buffer contents
 * type (ARC_BUFC_NUMTYPES entries each), so data and metadata are tracked
 * separately.
 */
typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	zfs_refcount_t arcs_size;
	/*
	 * type tag of this state (an arc_state_type_t); supports the
	 * "dbufs" kstat
	 */
	arc_state_type_t arcs_state;
} arc_state_t;

typedef struct arc_callback arc_callback_t;

/*
 * Callback record for a read: acb_done is an arc_read_done_func_t and
 * acb_private is stored alongside it.  Records can be chained through
 * acb_next.
 * NOTE(review): the exact meaning of the acb_zio_dummy / acb_zio_head
 * zios is not visible in this header -- see the users in arc.c.
 */
struct arc_callback {
	void			*acb_private;
	arc_read_done_func_t	*acb_done;
	arc_buf_t		*acb_buf;
	boolean_t		acb_encrypted;
	boolean_t		acb_compressed;
	boolean_t		acb_noauth;
	zbookmark_phys_t	acb_zb;
	zio_t			*acb_zio_dummy;
	zio_t			*acb_zio_head;
	arc_callback_t		*acb_next;	/* next record in the chain */
};

typedef struct arc_write_callback arc_write_callback_t;

/*
 * Callback record for a write.  Holds four arc_write_done_func_t hooks
 * (ready, children_ready, physdone, done) together with the private
 * argument and the buffer they refer to.
 */
struct arc_write_callback {
	void			*awcb_private;
	arc_write_done_func_t	*awcb_ready;
	arc_write_done_func_t	*awcb_children_ready;
	arc_write_done_func_t	*awcb_physdone;
	arc_write_done_func_t	*awcb_done;
	arc_buf_t		*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache.
The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_t		*b_buf;
	uint32_t		b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;
	uint32_t		b_mru_hits;
	uint32_t		b_mru_ghost_hits;
	uint32_t		b_mfu_hits;
	uint32_t		b_mfu_ghost_hits;
	uint32_t		b_l2_hits;

	/* self protecting */
	zfs_refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;

/*
 * Flag bits stored in the dh_flags field of the persistent device header.
 */
typedef enum l2arc_dev_hdr_flags_t {
	L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)	/* mirror of l2ad_first */
} l2arc_dev_hdr_flags_t;

/*
 * Pointer used in persistent L2ARC (for pointing to log blocks).
 */
typedef struct l2arc_log_blkptr {
	/*
	 * Offset of log block within the device, in bytes
	 */
	uint64_t	lbp_daddr;
	/*
	 * Aligned payload size (in bytes) of the log block
	 */
	uint64_t	lbp_payload_asize;
	/*
	 * Offset in bytes of the first buffer in the payload
	 */
	uint64_t	lbp_payload_start;
	/*
	 * lbp_prop has the following format:
	 *	* logical size (in bytes)
	 *	* aligned (after compression) size (in bytes)
	 *	* compression algorithm (we always LZ4-compress l2arc logs)
	 *	* checksum algorithm (used for lbp_cksum)
	 * The individual fields are accessed with the L2BLK_GET_xxx and
	 * L2BLK_SET_xxx macros.
	 */
	uint64_t	lbp_prop;
	zio_cksum_t	lbp_cksum;	/* checksum of log */
} l2arc_log_blkptr_t;

/*
 * The persistent L2ARC device header.
 * Byte order of magic determines whether 64-bit bswap of fields is necessary.
*/
typedef struct l2arc_dev_hdr_phys {
	uint64_t	dh_magic;	/* L2ARC_DEV_HDR_MAGIC */
	uint64_t	dh_version;	/* Persistent L2ARC version */

	/*
	 * Global L2ARC device state and metadata.
	 */
	uint64_t	dh_spa_guid;
	uint64_t	dh_vdev_guid;
	uint64_t	dh_log_entries;		/* mirror of l2ad_log_entries */
	uint64_t	dh_evict;		/* evicted offset in bytes */
	uint64_t	dh_flags;		/* l2arc_dev_hdr_flags_t */
	/*
	 * Used in zdb.c for determining if a log block is valid, in the same
	 * way that l2arc_rebuild() does.
	 */
	uint64_t	dh_start;		/* mirror of l2ad_start */
	uint64_t	dh_end;			/* mirror of l2ad_end */
	/*
	 * Start of log block chain. [0] -> newest log, [1] -> one older (used
	 * for initiating prefetch).
	 */
	l2arc_log_blkptr_t	dh_start_lbps[2];
	/*
	 * Aligned size of all log blocks as accounted by vdev_space_update().
	 */
	uint64_t	dh_lb_asize;		/* mirror of l2ad_lb_asize */
	uint64_t	dh_lb_count;		/* mirror of l2ad_lb_count */
	/*
	 * Mirrors of vdev_trim_action_time and vdev_trim_state, used to
	 * display when the cache device was fully trimmed for the last
	 * time.
	 */
	uint64_t		dh_trim_action_time;
	uint64_t		dh_trim_state;
	const uint64_t		dh_pad[30];	/* pad to 512 bytes */
	zio_eck_t		dh_tail;
} l2arc_dev_hdr_phys_t;
/*
 * Compile-time check that the header occupies exactly SPA_MINBLOCKSIZE
 * bytes; any field added above must be paid for by shrinking dh_pad.
 */
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);

/*
 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
*/
typedef struct l2arc_log_ent_phys {
	dva_t			le_dva;		/* dva of buffer */
	uint64_t		le_birth;	/* birth txg of buffer */
	/*
	 * le_prop has the following format:
	 *	* logical size (in bytes)
	 *	* physical (compressed) size (in bytes)
	 *	* compression algorithm
	 *	* object type (used to restore arc_buf_contents_t)
	 *	* protected status (used for encryption)
	 *	* prefetch status (used in l2arc_read_done())
	 */
	uint64_t		le_prop;
	uint64_t		le_daddr;	/* buf location on l2dev */
	uint64_t		le_complevel;
	/*
	 * We pad the size of each entry to a power of 2 so that the size of
	 * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
	 * because of the L2BLK_SET_*SIZE macros.
	 */
	const uint64_t		le_pad[2];	/* pad to 64 bytes */
} l2arc_log_ent_phys_t;

/* Capacity of the lb_entries[] array in l2arc_log_blk_phys_t. */
#define	L2ARC_LOG_BLK_MAX_ENTRIES	(1022)

/*
 * A log block of up to 1022 ARC buffer log entries, chained into the
 * persistent L2ARC metadata linked list. Byte order of magic determines
 * whether 64-bit bswap of fields is necessary.
 */
typedef struct l2arc_log_blk_phys {
	uint64_t		lb_magic;	/* L2ARC_LOG_BLK_MAGIC */
	/*
	 * There are 2 chains (headed by dh_start_lbps[2]), and this field
	 * points back to the previous block in this chain. We alternate
	 * which chain we append to, so they are time-wise and offset-wise
	 * interleaved, but that is an optimization rather than for
	 * correctness.
	 */
	l2arc_log_blkptr_t	lb_prev_lbp;	/* pointer to prev log block */
	/*
	 * Pad header section to 128 bytes
	 */
	uint64_t		lb_pad[7];
	/* Payload */
	l2arc_log_ent_phys_t	lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
} l2arc_log_blk_phys_t;				/* 64K total */

/*
 * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
 * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
309 */ 310 CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), 311 1ULL << SPA_MINBLOCKSHIFT)); 312 CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE); 313 CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE); 314 315 /* 316 * These structures hold in-flight abd buffers for log blocks as they're being 317 * written to the L2ARC device. 318 */ 319 typedef struct l2arc_lb_abd_buf { 320 abd_t *abd; 321 list_node_t node; 322 } l2arc_lb_abd_buf_t; 323 324 /* 325 * These structures hold pointers to log blocks present on the L2ARC device. 326 */ 327 typedef struct l2arc_lb_ptr_buf { 328 l2arc_log_blkptr_t *lb_ptr; 329 list_node_t node; 330 } l2arc_lb_ptr_buf_t; 331 332 /* Macros for setting fields in le_prop and lbp_prop */ 333 #define L2BLK_GET_LSIZE(field) \ 334 BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) 335 #define L2BLK_SET_LSIZE(field, x) \ 336 BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) 337 #define L2BLK_GET_PSIZE(field) \ 338 BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) 339 #define L2BLK_SET_PSIZE(field, x) \ 340 BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) 341 #define L2BLK_GET_COMPRESS(field) \ 342 BF64_GET((field), 32, SPA_COMPRESSBITS) 343 #define L2BLK_SET_COMPRESS(field, x) \ 344 BF64_SET((field), 32, SPA_COMPRESSBITS, x) 345 #define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) 346 #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) 347 #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) 348 #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) 349 #define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) 350 #define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) 351 #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) 352 #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) 353 354 #define PTR_SWAP(x, y) \ 355 do { \ 356 void *tmp = (x);\ 357 x = y; \ 358 y = tmp; \ 359 _NOTE(CONSTCOND)\ 
360 } while (0) 361 362 #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ 363 #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ 364 365 /* 366 * L2ARC Internals 367 */ 368 typedef struct l2arc_dev { 369 vdev_t *l2ad_vdev; /* vdev */ 370 spa_t *l2ad_spa; /* spa */ 371 uint64_t l2ad_hand; /* next write location */ 372 uint64_t l2ad_start; /* first addr on device */ 373 uint64_t l2ad_end; /* last addr on device */ 374 boolean_t l2ad_first; /* first sweep through */ 375 boolean_t l2ad_writing; /* currently writing */ 376 kmutex_t l2ad_mtx; /* lock for buffer list */ 377 list_t l2ad_buflist; /* buffer list */ 378 list_node_t l2ad_node; /* device list node */ 379 zfs_refcount_t l2ad_alloc; /* allocated bytes */ 380 /* 381 * Persistence-related stuff 382 */ 383 l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ 384 uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ 385 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ 386 int l2ad_log_ent_idx; /* index into cur log blk */ 387 /* Number of bytes in current log block's payload */ 388 uint64_t l2ad_log_blk_payload_asize; 389 /* 390 * Offset (in bytes) of the first buffer in current log block's 391 * payload. 392 */ 393 uint64_t l2ad_log_blk_payload_start; 394 /* Flag indicating whether a rebuild is scheduled or is going on */ 395 boolean_t l2ad_rebuild; 396 boolean_t l2ad_rebuild_cancel; 397 boolean_t l2ad_rebuild_began; 398 uint64_t l2ad_log_entries; /* entries per log blk */ 399 uint64_t l2ad_evict; /* evicted offset in bytes */ 400 /* List of pointers to log blocks present in the L2ARC device */ 401 list_t l2ad_lbptr_list; 402 /* 403 * Aligned size of all log blocks as accounted by vdev_space_update(). 404 */ 405 zfs_refcount_t l2ad_lb_asize; 406 /* 407 * Number of log blocks present on the device. 
408 */ 409 zfs_refcount_t l2ad_lb_count; 410 boolean_t l2ad_trim_all; /* TRIM whole device */ 411 } l2arc_dev_t; 412 413 /* 414 * Encrypted blocks will need to be stored encrypted on the L2ARC 415 * disk as they appear in the main pool. In order for this to work we 416 * need to pass around the encryption parameters so they can be used 417 * to write data to the L2ARC. This struct is only defined in the 418 * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED 419 * flag set. 420 */ 421 typedef struct arc_buf_hdr_crypt { 422 abd_t *b_rabd; /* raw encrypted data */ 423 dmu_object_type_t b_ot; /* object type */ 424 uint32_t b_ebufcnt; /* count of encrypted buffers */ 425 426 /* dsobj for looking up encryption key for l2arc encryption */ 427 uint64_t b_dsobj; 428 429 /* encryption parameters */ 430 uint8_t b_salt[ZIO_DATA_SALT_LEN]; 431 uint8_t b_iv[ZIO_DATA_IV_LEN]; 432 433 /* 434 * Technically this could be removed since we will always be able to 435 * get the mac from the bp when we need it. However, it is inconvenient 436 * for callers of arc code to have to pass a bp in all the time. This 437 * also allows us to assert that L2ARC data is properly encrypted to 438 * match the data in the main storage pool. 
439 */ 440 uint8_t b_mac[ZIO_DATA_MAC_LEN]; 441 } arc_buf_hdr_crypt_t; 442 443 typedef struct l2arc_buf_hdr { 444 /* protected by arc_buf_hdr mutex */ 445 l2arc_dev_t *b_dev; /* L2ARC device */ 446 uint64_t b_daddr; /* disk address, offset byte */ 447 uint32_t b_hits; 448 list_node_t b_l2node; 449 } l2arc_buf_hdr_t; 450 451 typedef struct l2arc_write_callback { 452 l2arc_dev_t *l2wcb_dev; /* device info */ 453 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 454 /* in-flight list of log blocks */ 455 list_t l2wcb_abd_list; 456 } l2arc_write_callback_t; 457 458 struct arc_buf_hdr { 459 /* protected by hash lock */ 460 dva_t b_dva; 461 uint64_t b_birth; 462 463 arc_buf_contents_t b_type; 464 uint8_t b_complevel; 465 uint8_t b_reserved1; /* used for 4 byte alignment */ 466 uint16_t b_reserved2; /* used for 4 byte alignment */ 467 arc_buf_hdr_t *b_hash_next; 468 arc_flags_t b_flags; 469 470 /* 471 * This field stores the size of the data buffer after 472 * compression, and is set in the arc's zio completion handlers. 473 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). 474 * 475 * While the block pointers can store up to 32MB in their psize 476 * field, we can only store up to 32MB minus 512B. This is due 477 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. 478 * a field of zeros represents 512B in the bp). We can't use a 479 * bias of 1 since we need to reserve a psize of zero, here, to 480 * represent holes and embedded blocks. 481 * 482 * This isn't a problem in practice, since the maximum size of a 483 * buffer is limited to 16MB, so we never need to store 32MB in 484 * this field. Even in the upstream illumos code base, the 485 * maximum size of a buffer is limited to 16MB. 486 */ 487 uint16_t b_psize; 488 489 /* 490 * This field stores the size of the data buffer before 491 * compression, and cannot change once set. It is in units 492 * of SPA_MINBLOCKSIZE (e.g. 
2 == 1024 bytes) 493 */ 494 uint16_t b_lsize; /* immutable */ 495 uint64_t b_spa; /* immutable */ 496 497 /* L2ARC fields. Undefined when not in L2ARC. */ 498 l2arc_buf_hdr_t b_l2hdr; 499 /* L1ARC fields. Undefined when in l2arc_only state */ 500 l1arc_buf_hdr_t b_l1hdr; 501 /* 502 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED 503 * is set and the L1 header exists. 504 */ 505 arc_buf_hdr_crypt_t b_crypt_hdr; 506 }; 507 508 typedef struct arc_stats { 509 kstat_named_t arcstat_hits; 510 kstat_named_t arcstat_misses; 511 kstat_named_t arcstat_demand_data_hits; 512 kstat_named_t arcstat_demand_data_misses; 513 kstat_named_t arcstat_demand_metadata_hits; 514 kstat_named_t arcstat_demand_metadata_misses; 515 kstat_named_t arcstat_prefetch_data_hits; 516 kstat_named_t arcstat_prefetch_data_misses; 517 kstat_named_t arcstat_prefetch_metadata_hits; 518 kstat_named_t arcstat_prefetch_metadata_misses; 519 kstat_named_t arcstat_mru_hits; 520 kstat_named_t arcstat_mru_ghost_hits; 521 kstat_named_t arcstat_mfu_hits; 522 kstat_named_t arcstat_mfu_ghost_hits; 523 kstat_named_t arcstat_deleted; 524 /* 525 * Number of buffers that could not be evicted because the hash lock 526 * was held by another thread. The lock may not necessarily be held 527 * by something using the same buffer, since hash locks are shared 528 * by multiple buffers. 529 */ 530 kstat_named_t arcstat_mutex_miss; 531 /* 532 * Number of buffers skipped when updating the access state due to the 533 * header having already been released after acquiring the hash lock. 534 */ 535 kstat_named_t arcstat_access_skip; 536 /* 537 * Number of buffers skipped because they have I/O in progress, are 538 * indirect prefetch buffers that have not lived long enough, or are 539 * not from the spa we're trying to evict from. 540 */ 541 kstat_named_t arcstat_evict_skip; 542 /* 543 * Number of times arc_evict_state() was unable to evict enough 544 * buffers to reach its target amount. 
545 */ 546 kstat_named_t arcstat_evict_not_enough; 547 kstat_named_t arcstat_evict_l2_cached; 548 kstat_named_t arcstat_evict_l2_eligible; 549 kstat_named_t arcstat_evict_l2_ineligible; 550 kstat_named_t arcstat_evict_l2_skip; 551 kstat_named_t arcstat_hash_elements; 552 kstat_named_t arcstat_hash_elements_max; 553 kstat_named_t arcstat_hash_collisions; 554 kstat_named_t arcstat_hash_chains; 555 kstat_named_t arcstat_hash_chain_max; 556 kstat_named_t arcstat_p; 557 kstat_named_t arcstat_c; 558 kstat_named_t arcstat_c_min; 559 kstat_named_t arcstat_c_max; 560 /* Not updated directly; only synced in arc_kstat_update. */ 561 kstat_named_t arcstat_size; 562 /* 563 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. 564 * Note that the compressed bytes may match the uncompressed bytes 565 * if the block is either not compressed or compressed arc is disabled. 566 */ 567 kstat_named_t arcstat_compressed_size; 568 /* 569 * Uncompressed size of the data stored in b_pabd. If compressed 570 * arc is disabled then this value will be identical to the stat 571 * above. 572 */ 573 kstat_named_t arcstat_uncompressed_size; 574 /* 575 * Number of bytes stored in all the arc_buf_t's. This is classified 576 * as "overhead" since this data is typically short-lived and will 577 * be evicted from the arc when it becomes unreferenced unless the 578 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level 579 * values have been set (see comment in dbuf.c for more information). 580 */ 581 kstat_named_t arcstat_overhead_size; 582 /* 583 * Number of bytes consumed by internal ARC structures necessary 584 * for tracking purposes; these structures are not actually 585 * backed by ARC buffers. This includes arc_buf_hdr_t structures 586 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 587 * caches), and arc_buf_t structures (allocated via arc_buf_t 588 * cache). 589 * Not updated directly; only synced in arc_kstat_update. 
590 */ 591 kstat_named_t arcstat_hdr_size; 592 /* 593 * Number of bytes consumed by ARC buffers of type equal to 594 * ARC_BUFC_DATA. This is generally consumed by buffers backing 595 * on disk user data (e.g. plain file contents). 596 * Not updated directly; only synced in arc_kstat_update. 597 */ 598 kstat_named_t arcstat_data_size; 599 /* 600 * Number of bytes consumed by ARC buffers of type equal to 601 * ARC_BUFC_METADATA. This is generally consumed by buffers 602 * backing on disk data that is used for internal ZFS 603 * structures (e.g. ZAP, dnode, indirect blocks, etc). 604 * Not updated directly; only synced in arc_kstat_update. 605 */ 606 kstat_named_t arcstat_metadata_size; 607 /* 608 * Number of bytes consumed by dmu_buf_impl_t objects. 609 * Not updated directly; only synced in arc_kstat_update. 610 */ 611 kstat_named_t arcstat_dbuf_size; 612 /* 613 * Number of bytes consumed by dnode_t objects. 614 * Not updated directly; only synced in arc_kstat_update. 615 */ 616 kstat_named_t arcstat_dnode_size; 617 /* 618 * Number of bytes consumed by bonus buffers. 619 * Not updated directly; only synced in arc_kstat_update. 620 */ 621 kstat_named_t arcstat_bonus_size; 622 #if defined(COMPAT_FREEBSD11) 623 /* 624 * Sum of the previous three counters, provided for compatibility. 625 */ 626 kstat_named_t arcstat_other_size; 627 #endif 628 629 /* 630 * Total number of bytes consumed by ARC buffers residing in the 631 * arc_anon state. This includes *all* buffers in the arc_anon 632 * state; e.g. data, metadata, evictable, and unevictable buffers 633 * are all included in this value. 634 * Not updated directly; only synced in arc_kstat_update. 635 */ 636 kstat_named_t arcstat_anon_size; 637 /* 638 * Number of bytes consumed by ARC buffers that meet the 639 * following criteria: backing buffers of type ARC_BUFC_DATA, 640 * residing in the arc_anon state, and are eligible for eviction 641 * (e.g. have no outstanding holds on the buffer). 
642 * Not updated directly; only synced in arc_kstat_update. 643 */ 644 kstat_named_t arcstat_anon_evictable_data; 645 /* 646 * Number of bytes consumed by ARC buffers that meet the 647 * following criteria: backing buffers of type ARC_BUFC_METADATA, 648 * residing in the arc_anon state, and are eligible for eviction 649 * (e.g. have no outstanding holds on the buffer). 650 * Not updated directly; only synced in arc_kstat_update. 651 */ 652 kstat_named_t arcstat_anon_evictable_metadata; 653 /* 654 * Total number of bytes consumed by ARC buffers residing in the 655 * arc_mru state. This includes *all* buffers in the arc_mru 656 * state; e.g. data, metadata, evictable, and unevictable buffers 657 * are all included in this value. 658 * Not updated directly; only synced in arc_kstat_update. 659 */ 660 kstat_named_t arcstat_mru_size; 661 /* 662 * Number of bytes consumed by ARC buffers that meet the 663 * following criteria: backing buffers of type ARC_BUFC_DATA, 664 * residing in the arc_mru state, and are eligible for eviction 665 * (e.g. have no outstanding holds on the buffer). 666 * Not updated directly; only synced in arc_kstat_update. 667 */ 668 kstat_named_t arcstat_mru_evictable_data; 669 /* 670 * Number of bytes consumed by ARC buffers that meet the 671 * following criteria: backing buffers of type ARC_BUFC_METADATA, 672 * residing in the arc_mru state, and are eligible for eviction 673 * (e.g. have no outstanding holds on the buffer). 674 * Not updated directly; only synced in arc_kstat_update. 675 */ 676 kstat_named_t arcstat_mru_evictable_metadata; 677 /* 678 * Total number of bytes that *would have been* consumed by ARC 679 * buffers in the arc_mru_ghost state. The key thing to note 680 * here, is the fact that this size doesn't actually indicate 681 * RAM consumption. The ghost lists only consist of headers and 682 * don't actually have ARC buffers linked off of these headers. 
683 * Thus, *if* the headers had associated ARC buffers, these 684 * buffers *would have* consumed this number of bytes. 685 * Not updated directly; only synced in arc_kstat_update. 686 */ 687 kstat_named_t arcstat_mru_ghost_size; 688 /* 689 * Number of bytes that *would have been* consumed by ARC 690 * buffers that are eligible for eviction, of type 691 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 692 * Not updated directly; only synced in arc_kstat_update. 693 */ 694 kstat_named_t arcstat_mru_ghost_evictable_data; 695 /* 696 * Number of bytes that *would have been* consumed by ARC 697 * buffers that are eligible for eviction, of type 698 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 699 * Not updated directly; only synced in arc_kstat_update. 700 */ 701 kstat_named_t arcstat_mru_ghost_evictable_metadata; 702 /* 703 * Total number of bytes consumed by ARC buffers residing in the 704 * arc_mfu state. This includes *all* buffers in the arc_mfu 705 * state; e.g. data, metadata, evictable, and unevictable buffers 706 * are all included in this value. 707 * Not updated directly; only synced in arc_kstat_update. 708 */ 709 kstat_named_t arcstat_mfu_size; 710 /* 711 * Number of bytes consumed by ARC buffers that are eligible for 712 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 713 * state. 714 * Not updated directly; only synced in arc_kstat_update. 715 */ 716 kstat_named_t arcstat_mfu_evictable_data; 717 /* 718 * Number of bytes consumed by ARC buffers that are eligible for 719 * eviction, of type ARC_BUFC_METADATA, and reside in the 720 * arc_mfu state. 721 * Not updated directly; only synced in arc_kstat_update. 722 */ 723 kstat_named_t arcstat_mfu_evictable_metadata; 724 /* 725 * Total number of bytes that *would have been* consumed by ARC 726 * buffers in the arc_mfu_ghost state. See the comment above 727 * arcstat_mru_ghost_size for more details. 728 * Not updated directly; only synced in arc_kstat_update. 
729 */ 730 kstat_named_t arcstat_mfu_ghost_size; 731 /* 732 * Number of bytes that *would have been* consumed by ARC 733 * buffers that are eligible for eviction, of type 734 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 735 * Not updated directly; only synced in arc_kstat_update. 736 */ 737 kstat_named_t arcstat_mfu_ghost_evictable_data; 738 /* 739 * Number of bytes that *would have been* consumed by ARC 740 * buffers that are eligible for eviction, of type 741 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 742 * Not updated directly; only synced in arc_kstat_update. 743 */ 744 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 745 kstat_named_t arcstat_l2_hits; 746 kstat_named_t arcstat_l2_misses; 747 kstat_named_t arcstat_l2_feeds; 748 kstat_named_t arcstat_l2_rw_clash; 749 kstat_named_t arcstat_l2_read_bytes; 750 kstat_named_t arcstat_l2_write_bytes; 751 kstat_named_t arcstat_l2_writes_sent; 752 kstat_named_t arcstat_l2_writes_done; 753 kstat_named_t arcstat_l2_writes_error; 754 kstat_named_t arcstat_l2_writes_lock_retry; 755 kstat_named_t arcstat_l2_evict_lock_retry; 756 kstat_named_t arcstat_l2_evict_reading; 757 kstat_named_t arcstat_l2_evict_l1cached; 758 kstat_named_t arcstat_l2_free_on_write; 759 kstat_named_t arcstat_l2_abort_lowmem; 760 kstat_named_t arcstat_l2_cksum_bad; 761 kstat_named_t arcstat_l2_io_error; 762 kstat_named_t arcstat_l2_lsize; 763 kstat_named_t arcstat_l2_psize; 764 /* Not updated directly; only synced in arc_kstat_update. */ 765 kstat_named_t arcstat_l2_hdr_size; 766 /* 767 * Number of L2ARC log blocks written. These are used for restoring the 768 * L2ARC. Updated during writing of L2ARC log blocks. 769 */ 770 kstat_named_t arcstat_l2_log_blk_writes; 771 /* 772 * Moving average of the aligned size of the L2ARC log blocks, in 773 * bytes. Updated during L2ARC rebuild and during writing of L2ARC 774 * log blocks. 
775 */ 776 kstat_named_t arcstat_l2_log_blk_avg_asize; 777 /* Aligned size of L2ARC log blocks on L2ARC devices. */ 778 kstat_named_t arcstat_l2_log_blk_asize; 779 /* Number of L2ARC log blocks present on L2ARC devices. */ 780 kstat_named_t arcstat_l2_log_blk_count; 781 /* 782 * Moving average of the aligned size of L2ARC restored data, in bytes, 783 * to the aligned size of their metadata in L2ARC, in bytes. 784 * Updated during L2ARC rebuild and during writing of L2ARC log blocks. 785 */ 786 kstat_named_t arcstat_l2_data_to_meta_ratio; 787 /* 788 * Number of times the L2ARC rebuild was successful for an L2ARC device. 789 */ 790 kstat_named_t arcstat_l2_rebuild_success; 791 /* 792 * Number of times the L2ARC rebuild failed because the device header 793 * was in an unsupported format or corrupted. 794 */ 795 kstat_named_t arcstat_l2_rebuild_abort_unsupported; 796 /* 797 * Number of times the L2ARC rebuild failed because of IO errors 798 * while reading a log block. 799 */ 800 kstat_named_t arcstat_l2_rebuild_abort_io_errors; 801 /* 802 * Number of times the L2ARC rebuild failed because of IO errors when 803 * reading the device header. 804 */ 805 kstat_named_t arcstat_l2_rebuild_abort_dh_errors; 806 /* 807 * Number of L2ARC log blocks which failed to be restored due to 808 * checksum errors. 809 */ 810 kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; 811 /* 812 * Number of times the L2ARC rebuild was aborted due to low system 813 * memory. 814 */ 815 kstat_named_t arcstat_l2_rebuild_abort_lowmem; 816 /* Logical size of L2ARC restored data, in bytes. */ 817 kstat_named_t arcstat_l2_rebuild_size; 818 /* Aligned size of L2ARC restored data, in bytes. */ 819 kstat_named_t arcstat_l2_rebuild_asize; 820 /* 821 * Number of L2ARC log entries (buffers) that were successfully 822 * restored in ARC. 823 */ 824 kstat_named_t arcstat_l2_rebuild_bufs; 825 /* 826 * Number of L2ARC log entries (buffers) already cached in ARC. These 827 * were not restored again. 
828 */ 829 kstat_named_t arcstat_l2_rebuild_bufs_precached; 830 /* 831 * Number of L2ARC log blocks that were restored successfully. Each 832 * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. 833 */ 834 kstat_named_t arcstat_l2_rebuild_log_blks; 835 kstat_named_t arcstat_memory_throttle_count; 836 kstat_named_t arcstat_memory_direct_count; 837 kstat_named_t arcstat_memory_indirect_count; 838 kstat_named_t arcstat_memory_all_bytes; 839 kstat_named_t arcstat_memory_free_bytes; 840 kstat_named_t arcstat_memory_available_bytes; 841 kstat_named_t arcstat_no_grow; 842 kstat_named_t arcstat_tempreserve; 843 kstat_named_t arcstat_loaned_bytes; 844 kstat_named_t arcstat_prune; 845 /* Not updated directly; only synced in arc_kstat_update. */ 846 kstat_named_t arcstat_meta_used; 847 kstat_named_t arcstat_meta_limit; 848 kstat_named_t arcstat_dnode_limit; 849 kstat_named_t arcstat_meta_max; 850 kstat_named_t arcstat_meta_min; 851 kstat_named_t arcstat_async_upgrade_sync; 852 kstat_named_t arcstat_demand_hit_predictive_prefetch; 853 kstat_named_t arcstat_demand_hit_prescient_prefetch; 854 kstat_named_t arcstat_need_free; 855 kstat_named_t arcstat_sys_free; 856 kstat_named_t arcstat_raw_size; 857 kstat_named_t arcstat_cached_only_in_progress; 858 kstat_named_t arcstat_abd_chunk_waste_size; 859 } arc_stats_t; 860 861 typedef struct arc_evict_waiter { 862 list_node_t aew_node; 863 kcondvar_t aew_cv; 864 uint64_t aew_count; 865 } arc_evict_waiter_t; 866 867 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 868 869 #define ARCSTAT_INCR(stat, val) \ 870 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 871 872 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 873 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 874 875 #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ 876 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 877 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 878 #define arc_c_min ARCSTAT(arcstat_c_min) /* min 
target cache size */ 879 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 880 #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ 881 882 extern taskq_t *arc_prune_taskq; 883 extern arc_stats_t arc_stats; 884 extern hrtime_t arc_growtime; 885 extern boolean_t arc_warm; 886 extern int arc_grow_retry; 887 extern int arc_no_grow_shift; 888 extern int arc_shrink_shift; 889 extern kmutex_t arc_prune_mtx; 890 extern list_t arc_prune_list; 891 extern aggsum_t arc_size; 892 extern arc_state_t *arc_mfu; 893 extern arc_state_t *arc_mru; 894 extern uint_t zfs_arc_pc_percent; 895 extern int arc_lotsfree_percent; 896 extern unsigned long zfs_arc_min; 897 extern unsigned long zfs_arc_max; 898 899 extern void arc_reduce_target_size(int64_t to_free); 900 extern boolean_t arc_reclaim_needed(void); 901 extern void arc_kmem_reap_soon(void); 902 extern boolean_t arc_is_overflowing(void); 903 extern void arc_wait_for_eviction(uint64_t); 904 905 extern void arc_lowmem_init(void); 906 extern void arc_lowmem_fini(void); 907 extern void arc_prune_async(int64_t); 908 extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); 909 extern uint64_t arc_free_memory(void); 910 extern int64_t arc_available_memory(void); 911 extern void arc_tuning_update(boolean_t); 912 913 extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS); 914 extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); 915 916 /* used in zdb.c */ 917 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, 918 const l2arc_log_blkptr_t *lbp); 919 920 /* used in vdev_trim.c */ 921 void l2arc_dev_hdr_update(l2arc_dev_t *dev); 922 l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); 923 924 #ifdef __cplusplus 925 } 926 #endif 927 928 #endif /* _SYS_ARC_IMPL_H */ 929