/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, Delphix. All rights reserved.
 * Copyright (c) 2013, Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2020, George Amanakis. All rights reserved.
 */

#ifndef	_SYS_ARC_IMPL_H
#define	_SYS_ARC_IMPL_H

#include <sys/arc.h>
#include <sys/zio_crypt.h>
#include <sys/zthr.h>
#include <sys/aggsum.h>
#include <sys/wmsum.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	/*
	 * list of evictable buffers (one multilist per buffer content
	 * type, indexed up to ARC_BUFC_NUMTYPES)
	 */
	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * supports the "dbufs" kstat
	 */
	arc_state_type_t arcs_state;
	/*
	 * total amount of evictable data in this state (tracked per
	 * buffer content type, like arcs_list)
	 */
	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned;
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	zfs_refcount_t arcs_size;
} arc_state_t;

typedef struct arc_callback arc_callback_t;

/*
 * Bookkeeping for a read-completion callback.  acb_done is an
 * arc_read_done_func_t and acb_private is presumably the opaque argument
 * handed back to it (confirm in arc.c).  Records are chained through
 * acb_next; l1arc_buf_hdr_t.b_acb points at such a record.
 */
struct arc_callback {
	void			*acb_private;
	arc_read_done_func_t	*acb_done;
	arc_buf_t		*acb_buf;
	boolean_t		acb_encrypted;
	boolean_t		acb_compressed;
	boolean_t		acb_noauth;
	boolean_t		acb_nobuf;
	zbookmark_phys_t	acb_zb;
	zio_t			*acb_zio_dummy;
	zio_t			*acb_zio_head;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

/*
 * Bookkeeping for the callbacks of a write.  All four hooks share the
 * arc_write_done_func_t signature; awcb_buf is the buffer involved and
 * awcb_private is presumably the caller's opaque argument (confirm in
 * arc.c).
 */
struct arc_write_callback {
	void			*awcb_private;
	arc_write_done_func_t	*awcb_ready;
	arc_write_done_func_t	*awcb_children_ready;
	arc_write_done_func_t	*awcb_physdone;
	arc_write_done_func_t	*awcb_done;
	arc_buf_t		*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	/* for waiting on reads to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* protected by hash lock */
	clock_t			b_arc_access;
	uint32_t		b_mru_hits;
	uint32_t		b_mru_ghost_hits;
	uint32_t		b_mfu_hits;
	uint32_t		b_mfu_ghost_hits;
	uint32_t		b_bufcnt;
	arc_buf_t		*b_buf;

	/* self protecting */
	zfs_refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;

/*
 * Flag bits stored in l2arc_dev_hdr_phys_t.dh_flags.
 */
typedef enum l2arc_dev_hdr_flags_t {
	L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)	/* mirror of l2ad_first */
} l2arc_dev_hdr_flags_t;

/*
 * Pointer used in persistent L2ARC (for pointing to log blocks).
 */
typedef struct l2arc_log_blkptr {
	/*
	 * Offset of log block within the device, in bytes
	 */
	uint64_t	lbp_daddr;
	/*
	 * Aligned payload size (in bytes) of the log block
	 */
	uint64_t	lbp_payload_asize;
	/*
	 * Offset in bytes of the first buffer in the payload
	 */
	uint64_t	lbp_payload_start;
	/*
	 * lbp_prop has the following format:
	 *	* logical size (in bytes)
	 *	* aligned (after compression) size (in bytes)
	 *	* compression algorithm (we always LZ4-compress l2arc logs)
	 *	* checksum algorithm (used for lbp_cksum)
	 * The fields are packed and unpacked with the L2BLK_GET_* and
	 * L2BLK_SET_* macros defined later in this file.
	 */
	uint64_t	lbp_prop;
	zio_cksum_t	lbp_cksum;	/* checksum of log */
} l2arc_log_blkptr_t;

/*
 * The persistent L2ARC device header.
 * Byte order of magic determines whether 64-bit bswap of fields is necessary.
 */
typedef struct l2arc_dev_hdr_phys {
	uint64_t	dh_magic;	/* L2ARC_DEV_HDR_MAGIC */
	uint64_t	dh_version;	/* Persistent L2ARC version */

	/*
	 * Global L2ARC device state and metadata.
	 */
	uint64_t	dh_spa_guid;
	uint64_t	dh_vdev_guid;
	uint64_t	dh_log_entries;		/* mirror of l2ad_log_entries */
	uint64_t	dh_evict;		/* evicted offset in bytes */
	uint64_t	dh_flags;		/* l2arc_dev_hdr_flags_t */
	/*
	 * Used in zdb.c for determining if a log block is valid, in the same
	 * way that l2arc_rebuild() does.
	 */
	uint64_t	dh_start;		/* mirror of l2ad_start */
	uint64_t	dh_end;			/* mirror of l2ad_end */
	/*
	 * Start of log block chain. [0] -> newest log, [1] -> one older (used
	 * for initiating prefetch).
	 */
	l2arc_log_blkptr_t	dh_start_lbps[2];
	/*
	 * Aligned size of all log blocks as accounted by vdev_space_update().
	 */
	uint64_t	dh_lb_asize;		/* mirror of l2ad_lb_asize */
	uint64_t	dh_lb_count;		/* mirror of l2ad_lb_count */
	/*
	 * Mirrors of vdev_trim_action_time and vdev_trim_state, used to
	 * display when the cache device was fully trimmed for the last
	 * time.
	 */
	uint64_t		dh_trim_action_time;
	uint64_t		dh_trim_state;
	const uint64_t		dh_pad[30];	/* pad to 512 bytes */
	zio_eck_t		dh_tail;
} l2arc_dev_hdr_phys_t;
/*
 * The device header must be exactly SPA_MINBLOCKSIZE bytes; adding or
 * resizing a field above requires adjusting dh_pad to compensate.
 */
_Static_assert(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE,
	"l2arc_dev_hdr_phys_t wrong size");

/*
 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
 */
typedef struct l2arc_log_ent_phys {
	dva_t			le_dva;		/* dva of buffer */
	uint64_t		le_birth;	/* birth txg of buffer */
	/*
	 * le_prop has the following format:
	 *	* logical size (in bytes)
	 *	* physical (compressed) size (in bytes)
	 *	* compression algorithm
	 *	* object type (used to restore arc_buf_contents_t)
	 *	* protected status (used for encryption)
	 *	* prefetch status (used in l2arc_read_done())
	 * NOTE(review): this file also defines L2BLK_GET_STATE and
	 * L2BLK_SET_STATE, which are not listed here; presumably that
	 * field lives in le_prop as well -- confirm against the callers.
	 */
	uint64_t		le_prop;
	uint64_t		le_daddr;	/* buf location on l2dev */
	uint64_t		le_complevel;
	/*
	 * We pad the size of each entry to a power of 2 so that the size of
	 * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
	 * because of the L2BLK_SET_*SIZE macros.
	 */
	const uint64_t		le_pad[2];	/* pad to 64 bytes	 */
} l2arc_log_ent_phys_t;

#define	L2ARC_LOG_BLK_MAX_ENTRIES	(1022)

/*
 * A log block of up to 1022 ARC buffer log entries, chained into the
 * persistent L2ARC metadata linked list. Byte order of magic determines
 * whether 64-bit bswap of fields is necessary.
 */
typedef struct l2arc_log_blk_phys {
	uint64_t		lb_magic;	/* L2ARC_LOG_BLK_MAGIC */
	/*
	 * There are 2 chains (headed by dh_start_lbps[2]), and this field
	 * points back to the previous block in this chain. We alternate
	 * which chain we append to, so they are time-wise and offset-wise
	 * interleaved, but that is an optimization rather than for
	 * correctness.
	 */
	l2arc_log_blkptr_t	lb_prev_lbp;	/* pointer to prev log block */
	/*
	 * Pad header section to 128 bytes
	 */
	uint64_t		lb_pad[7];
	/* Payload */
	l2arc_log_ent_phys_t	lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
} l2arc_log_blk_phys_t;				/* 64K total */

/*
 * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
 * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
 */
_Static_assert(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
    1ULL << SPA_MINBLOCKSHIFT), "l2arc_log_blk_phys_t misaligned");
_Static_assert(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE,
	"l2arc_log_blk_phys_t too small");
_Static_assert(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE,
	"l2arc_log_blk_phys_t too big");

/*
 * These structures hold in-flight abd buffers for log blocks as they're being
 * written to the L2ARC device.
 */
typedef struct l2arc_lb_abd_buf {
	abd_t		*abd;
	list_node_t	node;
} l2arc_lb_abd_buf_t;

/*
 * These structures hold pointers to log blocks present on the L2ARC device.
 */
typedef struct l2arc_lb_ptr_buf {
	l2arc_log_blkptr_t	*lb_ptr;
	list_node_t		node;
} l2arc_lb_ptr_buf_t;

/*
 * Macros for getting and setting fields in le_prop and lbp_prop.
 *
 * Field positions as encoded by the macro arguments below.
 * NOTE(review): this reads the BF64_* arguments as (word, low bit, width)
 * and the extra *_SB arguments as (shift, bias) -- confirm against the
 * BF64_* definitions.
 *	LSIZE		low bit 0,  SPA_LSIZEBITS wide
 *	PSIZE		low bit 16, SPA_PSIZEBITS wide
 *	COMPRESS	low bit 32, SPA_COMPRESSBITS wide
 *	PREFETCH	low bit 39, 1 bit
 *	CHECKSUM	low bit 40, 8 bits
 *	TYPE		low bit 48, 8 bits
 *	PROTECTED	low bit 56, 1 bit
 */
#define	L2BLK_GET_LSIZE(field)	\
	BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_LSIZE(field, x)	\
	BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define	L2BLK_GET_PSIZE(field)	\
	BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_PSIZE(field, x)	\
	BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define	L2BLK_GET_COMPRESS(field)	\
	BF64_GET((field), 32, SPA_COMPRESSBITS)
#define	L2BLK_SET_COMPRESS(field, x)	\
	BF64_SET((field), 32, SPA_COMPRESSBITS, x)
#define	L2BLK_GET_PREFETCH(field)	BF64_GET((field), 39, 1)
#define	L2BLK_SET_PREFETCH(field, x)	BF64_SET((field), 39, 1, x)
#define	L2BLK_GET_CHECKSUM(field)	BF64_GET((field), 40, 8)
#define	L2BLK_SET_CHECKSUM(field, x)	BF64_SET((field), 40, 8, x)
#define	L2BLK_GET_TYPE(field)		BF64_GET((field), 48, 8)
#define	L2BLK_SET_TYPE(field, x)	BF64_SET((field), 48, 8, x)
#define	L2BLK_GET_PROTECTED(field)	BF64_GET((field), 56, 1)
#define	L2BLK_SET_PROTECTED(field, x)	BF64_SET((field), 56, 1, x)
#define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) 357 #define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) 358 359 #define PTR_SWAP(x, y) \ 360 do { \ 361 void *tmp = (x);\ 362 x = y; \ 363 y = tmp; \ 364 } while (0) 365 366 #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ 367 #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ 368 369 /* 370 * L2ARC Internals 371 */ 372 typedef struct l2arc_dev { 373 vdev_t *l2ad_vdev; /* vdev */ 374 spa_t *l2ad_spa; /* spa */ 375 uint64_t l2ad_hand; /* next write location */ 376 uint64_t l2ad_start; /* first addr on device */ 377 uint64_t l2ad_end; /* last addr on device */ 378 boolean_t l2ad_first; /* first sweep through */ 379 boolean_t l2ad_writing; /* currently writing */ 380 kmutex_t l2ad_mtx; /* lock for buffer list */ 381 list_t l2ad_buflist; /* buffer list */ 382 list_node_t l2ad_node; /* device list node */ 383 zfs_refcount_t l2ad_alloc; /* allocated bytes */ 384 /* 385 * Persistence-related stuff 386 */ 387 l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ 388 uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ 389 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ 390 int l2ad_log_ent_idx; /* index into cur log blk */ 391 /* Number of bytes in current log block's payload */ 392 uint64_t l2ad_log_blk_payload_asize; 393 /* 394 * Offset (in bytes) of the first buffer in current log block's 395 * payload. 396 */ 397 uint64_t l2ad_log_blk_payload_start; 398 /* Flag indicating whether a rebuild is scheduled or is going on */ 399 boolean_t l2ad_rebuild; 400 boolean_t l2ad_rebuild_cancel; 401 boolean_t l2ad_rebuild_began; 402 uint64_t l2ad_log_entries; /* entries per log blk */ 403 uint64_t l2ad_evict; /* evicted offset in bytes */ 404 /* List of pointers to log blocks present in the L2ARC device */ 405 list_t l2ad_lbptr_list; 406 /* 407 * Aligned size of all log blocks as accounted by vdev_space_update(). 
408 */ 409 zfs_refcount_t l2ad_lb_asize; 410 /* 411 * Number of log blocks present on the device. 412 */ 413 zfs_refcount_t l2ad_lb_count; 414 boolean_t l2ad_trim_all; /* TRIM whole device */ 415 } l2arc_dev_t; 416 417 /* 418 * Encrypted blocks will need to be stored encrypted on the L2ARC 419 * disk as they appear in the main pool. In order for this to work we 420 * need to pass around the encryption parameters so they can be used 421 * to write data to the L2ARC. This struct is only defined in the 422 * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED 423 * flag set. 424 */ 425 typedef struct arc_buf_hdr_crypt { 426 abd_t *b_rabd; /* raw encrypted data */ 427 dmu_object_type_t b_ot; /* object type */ 428 uint32_t b_ebufcnt; /* count of encrypted buffers */ 429 430 /* dsobj for looking up encryption key for l2arc encryption */ 431 uint64_t b_dsobj; 432 433 /* encryption parameters */ 434 uint8_t b_salt[ZIO_DATA_SALT_LEN]; 435 uint8_t b_iv[ZIO_DATA_IV_LEN]; 436 437 /* 438 * Technically this could be removed since we will always be able to 439 * get the mac from the bp when we need it. However, it is inconvenient 440 * for callers of arc code to have to pass a bp in all the time. This 441 * also allows us to assert that L2ARC data is properly encrypted to 442 * match the data in the main storage pool. 
443 */ 444 uint8_t b_mac[ZIO_DATA_MAC_LEN]; 445 } arc_buf_hdr_crypt_t; 446 447 typedef struct l2arc_buf_hdr { 448 /* protected by arc_buf_hdr mutex */ 449 l2arc_dev_t *b_dev; /* L2ARC device */ 450 uint64_t b_daddr; /* disk address, offset byte */ 451 uint32_t b_hits; 452 arc_state_type_t b_arcs_state; 453 list_node_t b_l2node; 454 } l2arc_buf_hdr_t; 455 456 typedef struct l2arc_write_callback { 457 l2arc_dev_t *l2wcb_dev; /* device info */ 458 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 459 /* in-flight list of log blocks */ 460 list_t l2wcb_abd_list; 461 } l2arc_write_callback_t; 462 463 struct arc_buf_hdr { 464 /* protected by hash lock */ 465 dva_t b_dva; 466 uint64_t b_birth; 467 468 arc_buf_contents_t b_type; 469 uint8_t b_complevel; 470 uint8_t b_reserved1; /* used for 4 byte alignment */ 471 uint16_t b_reserved2; /* used for 4 byte alignment */ 472 arc_buf_hdr_t *b_hash_next; 473 arc_flags_t b_flags; 474 475 /* 476 * This field stores the size of the data buffer after 477 * compression, and is set in the arc's zio completion handlers. 478 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). 479 * 480 * While the block pointers can store up to 32MB in their psize 481 * field, we can only store up to 32MB minus 512B. This is due 482 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. 483 * a field of zeros represents 512B in the bp). We can't use a 484 * bias of 1 since we need to reserve a psize of zero, here, to 485 * represent holes and embedded blocks. 486 * 487 * This isn't a problem in practice, since the maximum size of a 488 * buffer is limited to 16MB, so we never need to store 32MB in 489 * this field. Even in the upstream illumos code base, the 490 * maximum size of a buffer is limited to 16MB. 491 */ 492 uint16_t b_psize; 493 494 /* 495 * This field stores the size of the data buffer before 496 * compression, and cannot change once set. It is in units 497 * of SPA_MINBLOCKSIZE (e.g. 
2 == 1024 bytes) 498 */ 499 uint16_t b_lsize; /* immutable */ 500 uint64_t b_spa; /* immutable */ 501 502 /* L2ARC fields. Undefined when not in L2ARC. */ 503 l2arc_buf_hdr_t b_l2hdr; 504 /* L1ARC fields. Undefined when in l2arc_only state */ 505 l1arc_buf_hdr_t b_l1hdr; 506 /* 507 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED 508 * is set and the L1 header exists. 509 */ 510 arc_buf_hdr_crypt_t b_crypt_hdr; 511 }; 512 513 typedef struct arc_stats { 514 kstat_named_t arcstat_hits; 515 kstat_named_t arcstat_misses; 516 kstat_named_t arcstat_demand_data_hits; 517 kstat_named_t arcstat_demand_data_misses; 518 kstat_named_t arcstat_demand_metadata_hits; 519 kstat_named_t arcstat_demand_metadata_misses; 520 kstat_named_t arcstat_prefetch_data_hits; 521 kstat_named_t arcstat_prefetch_data_misses; 522 kstat_named_t arcstat_prefetch_metadata_hits; 523 kstat_named_t arcstat_prefetch_metadata_misses; 524 kstat_named_t arcstat_mru_hits; 525 kstat_named_t arcstat_mru_ghost_hits; 526 kstat_named_t arcstat_mfu_hits; 527 kstat_named_t arcstat_mfu_ghost_hits; 528 kstat_named_t arcstat_deleted; 529 /* 530 * Number of buffers that could not be evicted because the hash lock 531 * was held by another thread. The lock may not necessarily be held 532 * by something using the same buffer, since hash locks are shared 533 * by multiple buffers. 534 */ 535 kstat_named_t arcstat_mutex_miss; 536 /* 537 * Number of buffers skipped when updating the access state due to the 538 * header having already been released after acquiring the hash lock. 539 */ 540 kstat_named_t arcstat_access_skip; 541 /* 542 * Number of buffers skipped because they have I/O in progress, are 543 * indirect prefetch buffers that have not lived long enough, or are 544 * not from the spa we're trying to evict from. 545 */ 546 kstat_named_t arcstat_evict_skip; 547 /* 548 * Number of times arc_evict_state() was unable to evict enough 549 * buffers to reach its target amount. 
550 */ 551 kstat_named_t arcstat_evict_not_enough; 552 kstat_named_t arcstat_evict_l2_cached; 553 kstat_named_t arcstat_evict_l2_eligible; 554 kstat_named_t arcstat_evict_l2_eligible_mfu; 555 kstat_named_t arcstat_evict_l2_eligible_mru; 556 kstat_named_t arcstat_evict_l2_ineligible; 557 kstat_named_t arcstat_evict_l2_skip; 558 kstat_named_t arcstat_hash_elements; 559 kstat_named_t arcstat_hash_elements_max; 560 kstat_named_t arcstat_hash_collisions; 561 kstat_named_t arcstat_hash_chains; 562 kstat_named_t arcstat_hash_chain_max; 563 kstat_named_t arcstat_p; 564 kstat_named_t arcstat_c; 565 kstat_named_t arcstat_c_min; 566 kstat_named_t arcstat_c_max; 567 kstat_named_t arcstat_size; 568 /* 569 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. 570 * Note that the compressed bytes may match the uncompressed bytes 571 * if the block is either not compressed or compressed arc is disabled. 572 */ 573 kstat_named_t arcstat_compressed_size; 574 /* 575 * Uncompressed size of the data stored in b_pabd. If compressed 576 * arc is disabled then this value will be identical to the stat 577 * above. 578 */ 579 kstat_named_t arcstat_uncompressed_size; 580 /* 581 * Number of bytes stored in all the arc_buf_t's. This is classified 582 * as "overhead" since this data is typically short-lived and will 583 * be evicted from the arc when it becomes unreferenced unless the 584 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level 585 * values have been set (see comment in dbuf.c for more information). 586 */ 587 kstat_named_t arcstat_overhead_size; 588 /* 589 * Number of bytes consumed by internal ARC structures necessary 590 * for tracking purposes; these structures are not actually 591 * backed by ARC buffers. This includes arc_buf_hdr_t structures 592 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 593 * caches), and arc_buf_t structures (allocated via arc_buf_t 594 * cache). 
595 */ 596 kstat_named_t arcstat_hdr_size; 597 /* 598 * Number of bytes consumed by ARC buffers of type equal to 599 * ARC_BUFC_DATA. This is generally consumed by buffers backing 600 * on disk user data (e.g. plain file contents). 601 */ 602 kstat_named_t arcstat_data_size; 603 /* 604 * Number of bytes consumed by ARC buffers of type equal to 605 * ARC_BUFC_METADATA. This is generally consumed by buffers 606 * backing on disk data that is used for internal ZFS 607 * structures (e.g. ZAP, dnode, indirect blocks, etc). 608 */ 609 kstat_named_t arcstat_metadata_size; 610 /* 611 * Number of bytes consumed by dmu_buf_impl_t objects. 612 */ 613 kstat_named_t arcstat_dbuf_size; 614 /* 615 * Number of bytes consumed by dnode_t objects. 616 */ 617 kstat_named_t arcstat_dnode_size; 618 /* 619 * Number of bytes consumed by bonus buffers. 620 */ 621 kstat_named_t arcstat_bonus_size; 622 #if defined(COMPAT_FREEBSD11) 623 /* 624 * Sum of the previous three counters, provided for compatibility. 625 */ 626 kstat_named_t arcstat_other_size; 627 #endif 628 629 /* 630 * Total number of bytes consumed by ARC buffers residing in the 631 * arc_anon state. This includes *all* buffers in the arc_anon 632 * state; e.g. data, metadata, evictable, and unevictable buffers 633 * are all included in this value. 634 */ 635 kstat_named_t arcstat_anon_size; 636 /* 637 * Number of bytes consumed by ARC buffers that meet the 638 * following criteria: backing buffers of type ARC_BUFC_DATA, 639 * residing in the arc_anon state, and are eligible for eviction 640 * (e.g. have no outstanding holds on the buffer). 641 */ 642 kstat_named_t arcstat_anon_evictable_data; 643 /* 644 * Number of bytes consumed by ARC buffers that meet the 645 * following criteria: backing buffers of type ARC_BUFC_METADATA, 646 * residing in the arc_anon state, and are eligible for eviction 647 * (e.g. have no outstanding holds on the buffer). 
648 */ 649 kstat_named_t arcstat_anon_evictable_metadata; 650 /* 651 * Total number of bytes consumed by ARC buffers residing in the 652 * arc_mru state. This includes *all* buffers in the arc_mru 653 * state; e.g. data, metadata, evictable, and unevictable buffers 654 * are all included in this value. 655 */ 656 kstat_named_t arcstat_mru_size; 657 /* 658 * Number of bytes consumed by ARC buffers that meet the 659 * following criteria: backing buffers of type ARC_BUFC_DATA, 660 * residing in the arc_mru state, and are eligible for eviction 661 * (e.g. have no outstanding holds on the buffer). 662 */ 663 kstat_named_t arcstat_mru_evictable_data; 664 /* 665 * Number of bytes consumed by ARC buffers that meet the 666 * following criteria: backing buffers of type ARC_BUFC_METADATA, 667 * residing in the arc_mru state, and are eligible for eviction 668 * (e.g. have no outstanding holds on the buffer). 669 */ 670 kstat_named_t arcstat_mru_evictable_metadata; 671 /* 672 * Total number of bytes that *would have been* consumed by ARC 673 * buffers in the arc_mru_ghost state. The key thing to note 674 * here, is the fact that this size doesn't actually indicate 675 * RAM consumption. The ghost lists only consist of headers and 676 * don't actually have ARC buffers linked off of these headers. 677 * Thus, *if* the headers had associated ARC buffers, these 678 * buffers *would have* consumed this number of bytes. 679 */ 680 kstat_named_t arcstat_mru_ghost_size; 681 /* 682 * Number of bytes that *would have been* consumed by ARC 683 * buffers that are eligible for eviction, of type 684 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 685 */ 686 kstat_named_t arcstat_mru_ghost_evictable_data; 687 /* 688 * Number of bytes that *would have been* consumed by ARC 689 * buffers that are eligible for eviction, of type 690 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 
691 */ 692 kstat_named_t arcstat_mru_ghost_evictable_metadata; 693 /* 694 * Total number of bytes consumed by ARC buffers residing in the 695 * arc_mfu state. This includes *all* buffers in the arc_mfu 696 * state; e.g. data, metadata, evictable, and unevictable buffers 697 * are all included in this value. 698 */ 699 kstat_named_t arcstat_mfu_size; 700 /* 701 * Number of bytes consumed by ARC buffers that are eligible for 702 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 703 * state. 704 */ 705 kstat_named_t arcstat_mfu_evictable_data; 706 /* 707 * Number of bytes consumed by ARC buffers that are eligible for 708 * eviction, of type ARC_BUFC_METADATA, and reside in the 709 * arc_mfu state. 710 */ 711 kstat_named_t arcstat_mfu_evictable_metadata; 712 /* 713 * Total number of bytes that *would have been* consumed by ARC 714 * buffers in the arc_mfu_ghost state. See the comment above 715 * arcstat_mru_ghost_size for more details. 716 */ 717 kstat_named_t arcstat_mfu_ghost_size; 718 /* 719 * Number of bytes that *would have been* consumed by ARC 720 * buffers that are eligible for eviction, of type 721 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 722 */ 723 kstat_named_t arcstat_mfu_ghost_evictable_data; 724 /* 725 * Number of bytes that *would have been* consumed by ARC 726 * buffers that are eligible for eviction, of type 727 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 728 */ 729 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 730 kstat_named_t arcstat_l2_hits; 731 kstat_named_t arcstat_l2_misses; 732 /* 733 * Allocated size (in bytes) of L2ARC cached buffers by ARC state. 734 */ 735 kstat_named_t arcstat_l2_prefetch_asize; 736 kstat_named_t arcstat_l2_mru_asize; 737 kstat_named_t arcstat_l2_mfu_asize; 738 /* 739 * Allocated size (in bytes) of L2ARC cached buffers by buffer content 740 * type. 
741 */ 742 kstat_named_t arcstat_l2_bufc_data_asize; 743 kstat_named_t arcstat_l2_bufc_metadata_asize; 744 kstat_named_t arcstat_l2_feeds; 745 kstat_named_t arcstat_l2_rw_clash; 746 kstat_named_t arcstat_l2_read_bytes; 747 kstat_named_t arcstat_l2_write_bytes; 748 kstat_named_t arcstat_l2_writes_sent; 749 kstat_named_t arcstat_l2_writes_done; 750 kstat_named_t arcstat_l2_writes_error; 751 kstat_named_t arcstat_l2_writes_lock_retry; 752 kstat_named_t arcstat_l2_evict_lock_retry; 753 kstat_named_t arcstat_l2_evict_reading; 754 kstat_named_t arcstat_l2_evict_l1cached; 755 kstat_named_t arcstat_l2_free_on_write; 756 kstat_named_t arcstat_l2_abort_lowmem; 757 kstat_named_t arcstat_l2_cksum_bad; 758 kstat_named_t arcstat_l2_io_error; 759 kstat_named_t arcstat_l2_lsize; 760 kstat_named_t arcstat_l2_psize; 761 kstat_named_t arcstat_l2_hdr_size; 762 /* 763 * Number of L2ARC log blocks written. These are used for restoring the 764 * L2ARC. Updated during writing of L2ARC log blocks. 765 */ 766 kstat_named_t arcstat_l2_log_blk_writes; 767 /* 768 * Moving average of the aligned size of the L2ARC log blocks, in 769 * bytes. Updated during L2ARC rebuild and during writing of L2ARC 770 * log blocks. 771 */ 772 kstat_named_t arcstat_l2_log_blk_avg_asize; 773 /* Aligned size of L2ARC log blocks on L2ARC devices. */ 774 kstat_named_t arcstat_l2_log_blk_asize; 775 /* Number of L2ARC log blocks present on L2ARC devices. */ 776 kstat_named_t arcstat_l2_log_blk_count; 777 /* 778 * Moving average of the aligned size of L2ARC restored data, in bytes, 779 * to the aligned size of their metadata in L2ARC, in bytes. 780 * Updated during L2ARC rebuild and during writing of L2ARC log blocks. 781 */ 782 kstat_named_t arcstat_l2_data_to_meta_ratio; 783 /* 784 * Number of times the L2ARC rebuild was successful for an L2ARC device. 
785 */ 786 kstat_named_t arcstat_l2_rebuild_success; 787 /* 788 * Number of times the L2ARC rebuild failed because the device header 789 * was in an unsupported format or corrupted. 790 */ 791 kstat_named_t arcstat_l2_rebuild_abort_unsupported; 792 /* 793 * Number of times the L2ARC rebuild failed because of IO errors 794 * while reading a log block. 795 */ 796 kstat_named_t arcstat_l2_rebuild_abort_io_errors; 797 /* 798 * Number of times the L2ARC rebuild failed because of IO errors when 799 * reading the device header. 800 */ 801 kstat_named_t arcstat_l2_rebuild_abort_dh_errors; 802 /* 803 * Number of L2ARC log blocks which failed to be restored due to 804 * checksum errors. 805 */ 806 kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; 807 /* 808 * Number of times the L2ARC rebuild was aborted due to low system 809 * memory. 810 */ 811 kstat_named_t arcstat_l2_rebuild_abort_lowmem; 812 /* Logical size of L2ARC restored data, in bytes. */ 813 kstat_named_t arcstat_l2_rebuild_size; 814 /* Aligned size of L2ARC restored data, in bytes. */ 815 kstat_named_t arcstat_l2_rebuild_asize; 816 /* 817 * Number of L2ARC log entries (buffers) that were successfully 818 * restored in ARC. 819 */ 820 kstat_named_t arcstat_l2_rebuild_bufs; 821 /* 822 * Number of L2ARC log entries (buffers) already cached in ARC. These 823 * were not restored again. 824 */ 825 kstat_named_t arcstat_l2_rebuild_bufs_precached; 826 /* 827 * Number of L2ARC log blocks that were restored successfully. Each 828 * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. 
*/
	kstat_named_t arcstat_l2_rebuild_log_blks;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_memory_direct_count;
	kstat_named_t arcstat_memory_indirect_count;
	kstat_named_t arcstat_memory_all_bytes;
	kstat_named_t arcstat_memory_free_bytes;
	kstat_named_t arcstat_memory_available_bytes;
	kstat_named_t arcstat_no_grow;
	kstat_named_t arcstat_tempreserve;
	kstat_named_t arcstat_loaned_bytes;
	kstat_named_t arcstat_prune;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_dnode_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_async_upgrade_sync;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
	kstat_named_t arcstat_demand_hit_prescient_prefetch;
	kstat_named_t arcstat_need_free;
	kstat_named_t arcstat_sys_free;
	kstat_named_t arcstat_raw_size;
	kstat_named_t arcstat_cached_only_in_progress;
	kstat_named_t arcstat_abd_chunk_waste_size;
} arc_stats_t;

/*
 * Counters that back the ARCSTAT_INCR()/ARCSTAT_BUMP()/ARCSTAT_BUMPDOWN()
 * macros below, which expand to wmsum_add(&arc_sums.<member>, ...).
 * Member names match the identically named kstat_named_t members of
 * arc_stats_t, so a single `stat' token can name either structure.  Not
 * every arc_stats_t member has a counterpart here (for example
 * arcstat_no_grow and arcstat_meta_limit do not).
 *
 * Four members -- arcstat_size, arcstat_dnode_size, arcstat_l2_hdr_size
 * and arcstat_meta_used -- are aggsum_t rather than wmsum_t.
 * NOTE(review): ARCSTAT_INCR() hands its target to wmsum_add(), so these
 * four presumably have to be updated through the aggsum interface
 * instead; confirm against the users in arc.c.
 */
typedef struct arc_sums {
	wmsum_t arcstat_hits;
	wmsum_t arcstat_misses;
	wmsum_t arcstat_demand_data_hits;
	wmsum_t arcstat_demand_data_misses;
	wmsum_t arcstat_demand_metadata_hits;
	wmsum_t arcstat_demand_metadata_misses;
	wmsum_t arcstat_prefetch_data_hits;
	wmsum_t arcstat_prefetch_data_misses;
	wmsum_t arcstat_prefetch_metadata_hits;
	wmsum_t arcstat_prefetch_metadata_misses;
	wmsum_t arcstat_mru_hits;
	wmsum_t arcstat_mru_ghost_hits;
	wmsum_t arcstat_mfu_hits;
	wmsum_t arcstat_mfu_ghost_hits;
	wmsum_t arcstat_deleted;
	wmsum_t arcstat_mutex_miss;
	wmsum_t arcstat_access_skip;
	wmsum_t arcstat_evict_skip;
	wmsum_t arcstat_evict_not_enough;
	wmsum_t arcstat_evict_l2_cached;
	wmsum_t arcstat_evict_l2_eligible;
	wmsum_t arcstat_evict_l2_eligible_mfu;
	wmsum_t arcstat_evict_l2_eligible_mru;
	wmsum_t arcstat_evict_l2_ineligible;
	wmsum_t arcstat_evict_l2_skip;
	wmsum_t arcstat_hash_collisions;
	wmsum_t arcstat_hash_chains;
	aggsum_t arcstat_size;
	wmsum_t arcstat_compressed_size;
	wmsum_t arcstat_uncompressed_size;
	wmsum_t arcstat_overhead_size;
	wmsum_t arcstat_hdr_size;
	wmsum_t arcstat_data_size;
	wmsum_t arcstat_metadata_size;
	wmsum_t arcstat_dbuf_size;
	aggsum_t arcstat_dnode_size;
	wmsum_t arcstat_bonus_size;
	wmsum_t arcstat_l2_hits;
	wmsum_t arcstat_l2_misses;
	wmsum_t arcstat_l2_prefetch_asize;
	wmsum_t arcstat_l2_mru_asize;
	wmsum_t arcstat_l2_mfu_asize;
	wmsum_t arcstat_l2_bufc_data_asize;
	wmsum_t arcstat_l2_bufc_metadata_asize;
	wmsum_t arcstat_l2_feeds;
	wmsum_t arcstat_l2_rw_clash;
	wmsum_t arcstat_l2_read_bytes;
	wmsum_t arcstat_l2_write_bytes;
	wmsum_t arcstat_l2_writes_sent;
	wmsum_t arcstat_l2_writes_done;
	wmsum_t arcstat_l2_writes_error;
	wmsum_t arcstat_l2_writes_lock_retry;
	wmsum_t arcstat_l2_evict_lock_retry;
	wmsum_t arcstat_l2_evict_reading;
	wmsum_t arcstat_l2_evict_l1cached;
	wmsum_t arcstat_l2_free_on_write;
	wmsum_t arcstat_l2_abort_lowmem;
	wmsum_t arcstat_l2_cksum_bad;
	wmsum_t arcstat_l2_io_error;
	wmsum_t arcstat_l2_lsize;
	wmsum_t arcstat_l2_psize;
	aggsum_t arcstat_l2_hdr_size;
	wmsum_t arcstat_l2_log_blk_writes;
	wmsum_t arcstat_l2_log_blk_asize;
	wmsum_t arcstat_l2_log_blk_count;
	wmsum_t arcstat_l2_rebuild_success;
	wmsum_t arcstat_l2_rebuild_abort_unsupported;
	wmsum_t arcstat_l2_rebuild_abort_io_errors;
	wmsum_t arcstat_l2_rebuild_abort_dh_errors;
	wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors;
	wmsum_t arcstat_l2_rebuild_abort_lowmem;
	wmsum_t arcstat_l2_rebuild_size;
	wmsum_t arcstat_l2_rebuild_asize;
	wmsum_t arcstat_l2_rebuild_bufs;
	wmsum_t arcstat_l2_rebuild_bufs_precached;
	wmsum_t arcstat_l2_rebuild_log_blks;
	wmsum_t arcstat_memory_throttle_count;
	wmsum_t arcstat_memory_direct_count;
	wmsum_t arcstat_memory_indirect_count;
	wmsum_t arcstat_prune;
	aggsum_t arcstat_meta_used;
	wmsum_t arcstat_async_upgrade_sync;
	wmsum_t arcstat_demand_hit_predictive_prefetch;
	wmsum_t arcstat_demand_hit_prescient_prefetch;
	wmsum_t arcstat_raw_size;
	wmsum_t arcstat_cached_only_in_progress;
	wmsum_t arcstat_abd_chunk_waste_size;
} arc_sums_t;

/*
 * Record combining a list linkage (aew_node), a condition variable
 * (aew_cv) and a 64-bit count (aew_count).
 * NOTE(review): presumably one entry per thread blocked waiting for
 * eviction, with aew_count the amount it is waiting on; the users are not
 * visible in this header -- confirm in arc.c.
 */
typedef struct arc_evict_waiter {
	list_node_t aew_node;
	kcondvar_t aew_cv;
	uint64_t aew_count;
} arc_evict_waiter_t;

/* Expands to the ui64 value slot of the named member of arc_stats. */
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)

/*
 * Add `val' to the named member of arc_sums via wmsum_add().  Note that
 * this updates arc_sums, not the arc_stats value that ARCSTAT() accesses.
 */
#define ARCSTAT_INCR(stat, val) \
	wmsum_add(&arc_sums.stat, (val))

/* Adjust the named arc_sums counter by +1 / -1. */
#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)

/* Shorthand names for values stored directly in arc_stats. */
#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */

/*
 * Pointer shorthands for the ARC state objects.  In this section only
 * ARC_mfu and ARC_mru are declared extern; NOTE(review): the other four
 * objects presumably must be in scope wherever these macros are expanded
 * -- confirm where they are defined.
 */
#define arc_anon (&ARC_anon)
#define arc_mru (&ARC_mru)
#define arc_mru_ghost (&ARC_mru_ghost)
#define arc_mfu (&ARC_mfu)
#define arc_mfu_ghost (&ARC_mfu_ghost)
#define arc_l2c_only (&ARC_l2c_only)

/* Globals declared here and defined outside this header. */
extern taskq_t *arc_prune_taskq;
extern arc_stats_t arc_stats;
extern arc_sums_t arc_sums;
extern hrtime_t arc_growtime;
extern boolean_t arc_warm;
extern int arc_grow_retry;
extern int arc_no_grow_shift;
extern int arc_shrink_shift;
extern kmutex_t arc_prune_mtx;
extern list_t arc_prune_list;
extern arc_state_t ARC_mfu;
extern arc_state_t ARC_mru;
extern uint_t zfs_arc_pc_percent;
extern int arc_lotsfree_percent;
extern unsigned long zfs_arc_min;
extern unsigned long zfs_arc_max;

/*
 * Prototypes for ARC routines defined outside this header.
 * NOTE(review): arc_wait_for_eviction(), arc_prune_async() and
 * arc_tuning_update() are declared without parameter names, so the meaning
 * of their arguments cannot be read from this header; consider naming them.
 */
extern void arc_reduce_target_size(int64_t to_free);
extern boolean_t arc_reclaim_needed(void);
extern void arc_kmem_reap_soon(void);
extern void arc_wait_for_eviction(uint64_t, boolean_t);

/*
 * NOTE(review): judging by the names only, these look like the
 * memory-pressure and tuning hooks; confirm against their definitions.
 */
extern void arc_lowmem_init(void);
extern void arc_lowmem_fini(void);
extern void arc_prune_async(int64_t);
extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg);
extern uint64_t arc_free_memory(void);
extern int64_t arc_available_memory(void);
extern void arc_tuning_update(boolean_t);
extern void arc_register_hotplug(void);
extern void arc_unregister_hotplug(void);

/*
 * These take ZFS_MODULE_PARAM_ARGS, a macro defined elsewhere that supplies
 * the parameter list.  NOTE(review): presumably module-parameter setter
 * callbacks -- confirm.
 */
extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);

/*
 * The l2arc_* prototypes below omit the `extern' keyword used above; for
 * function declarations it is optional, so their linkage is the same.
 */

/* used in zdb.c */
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
    const l2arc_log_blkptr_t *lbp);

/* used in vdev_trim.c */
void l2arc_dev_hdr_update(l2arc_dev_t *dev);
l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);

/* Closes the extern "C" block opened at the top of this header. */
#ifdef __cplusplus
}
#endif

#endif /* _SYS_ARC_IMPL_H */