1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, Delphix. All rights reserved.
24  * Copyright (c) 2013, Saso Kiselkov. All rights reserved.
25  * Copyright (c) 2013, Nexenta Systems, Inc.  All rights reserved.
26  * Copyright (c) 2020, George Amanakis. All rights reserved.
27  */
28 
29 #ifndef _SYS_ARC_IMPL_H
30 #define	_SYS_ARC_IMPL_H
31 
32 #include <sys/arc.h>
33 #include <sys/zio_crypt.h>
34 #include <sys/zthr.h>
35 #include <sys/aggsum.h>
36 
37 #ifdef __cplusplus
38 extern "C" {
39 #endif
40 
41 /*
42  * Note that buffers can be in one of 6 states:
43  *	ARC_anon	- anonymous (discussed below)
44  *	ARC_mru		- recently used, currently cached
45  *	ARC_mru_ghost	- recently used, no longer in cache
46  *	ARC_mfu		- frequently used, currently cached
47  *	ARC_mfu_ghost	- frequently used, no longer in cache
48  *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to a buffer, it is
 * linked onto a list in one of these arc states.  These are
51  * the only buffers that can be evicted or deleted.  Within each
52  * state there are multiple lists, one for meta-data and one for
53  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
54  * etc.) is tracked separately so that it can be managed more
55  * explicitly: favored over data, limited explicitly.
56  *
57  * Anonymous buffers are buffers that are not associated with
58  * a DVA.  These are buffers that hold dirty block copies
59  * before they are written to stable storage.  By definition,
60  * they are "ref'd" and are considered part of arc_mru
61  * that cannot be freed.  Generally, they will acquire a DVA
62  * as they are written and migrate onto the arc_mru list.
63  *
64  * The ARC_l2c_only state is for buffers that are in the second
65  * level ARC but no longer in any of the ARC_m* lists.  The second
66  * level ARC itself may also contain buffers that are in any of
67  * the ARC_m* states - meaning that a buffer can exist in two
68  * places.  The reason for the ARC_l2c_only state is to keep the
69  * buffer header in the hash table, so that reads that hit the
70  * second level ARC benefit from these fast lookups.
71  */
72 
/*
 * Bookkeeping for a single ARC state.  The per-type members are arrays of
 * ARC_BUFC_NUMTYPES elements, i.e. one slot per buffer contents type.
 */
typedef struct arc_state {
	/*
	 * lists of evictable buffers, one multilist per buffer contents type
	 */
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state, tracked separately
	 * for each buffer contents type
	 */
	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	zfs_refcount_t arcs_size;
	/*
	 * type tag of this state; supports the "dbufs" kstat
	 */
	arc_state_type_t arcs_state;
} arc_state_t;
92 
/*
 * Callback record for an ARC read.  Records are chained through acb_next,
 * and l1arc_buf_hdr_t holds a pointer to such a record in b_acb.
 */
typedef struct arc_callback arc_callback_t;

struct arc_callback {
	/* opaque caller data; presumably passed to acb_done - TODO confirm */
	void			*acb_private;
	/* read-done callback function */
	arc_read_done_func_t	*acb_done;
	/* ARC buffer associated with this callback record */
	arc_buf_t		*acb_buf;
	/*
	 * NOTE(review): the three flags below look like per-request read
	 * options (encrypted data, compressed data, skip authentication);
	 * confirm against the code that fills them in.
	 */
	boolean_t		acb_encrypted;
	boolean_t		acb_compressed;
	boolean_t		acb_noauth;
	/* bookmark stored with the request */
	zbookmark_phys_t	acb_zb;
	/* NOTE(review): role of the two zio pointers is not visible here */
	zio_t			*acb_zio_dummy;
	zio_t			*acb_zio_head;
	/* next callback record in the chain */
	arc_callback_t		*acb_next;
};
107 
/*
 * Callback record for an ARC write: one private pointer, four callback
 * function pointers of type arc_write_done_func_t, and the buffer being
 * written.
 */
typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	/* opaque caller data; presumably passed to the callbacks - confirm */
	void			*awcb_private;
	/*
	 * NOTE(review): judging by the names, the four callbacks correspond
	 * to the ready, children-ready, physical-done and done stages of the
	 * write; confirm against the code that invokes them.
	 */
	arc_write_done_func_t	*awcb_ready;
	arc_write_done_func_t	*awcb_children_ready;
	arc_write_done_func_t	*awcb_physdone;
	arc_write_done_func_t	*awcb_done;
	/* ARC buffer associated with this write */
	arc_buf_t		*awcb_buf;
};
118 
119 /*
120  * ARC buffers are separated into multiple structs as a memory saving measure:
121  *   - Common fields struct, always defined, and embedded within it:
122  *       - L2-only fields, always allocated but undefined when not in L2ARC
123  *       - L1-only fields, only allocated when in L1ARC
124  *
125  *           Buffer in L1                     Buffer only in L2
126  *    +------------------------+          +------------------------+
127  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
128  *    |                        |          |                        |
129  *    |                        |          |                        |
130  *    |                        |          |                        |
131  *    +------------------------+          +------------------------+
132  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
133  *    | (undefined if L1-only) |          |                        |
134  *    +------------------------+          +------------------------+
135  *    | l1arc_buf_hdr_t        |
136  *    |                        |
137  *    |                        |
138  *    |                        |
139  *    |                        |
140  *    +------------------------+
141  *
142  * Because it's possible for the L2ARC to become extremely large, we can wind
143  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
144  * is minimized by only allocating the fields necessary for an L1-cached buffer
145  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
146  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
147  * words in pointers. arc_hdr_realloc() is used to switch a header between
148  * these two allocation states.
149  */
/*
 * L1-only portion of an ARC buffer header.  It is embedded in
 * arc_buf_hdr_t as b_l1hdr.
 */
typedef struct l1arc_buf_hdr {
	/* NOTE(review): presumably guards b_freeze_cksum - confirm */
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	/*
	 * NOTE(review): b_buf and b_bufcnt look like the head of this
	 * header's arc_buf_t chain and the number of entries on it -
	 * confirm against the code that maintains them.
	 */
	arc_buf_t		*b_buf;
	uint32_t		b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;


	/* protected by arc state mutex */
	arc_state_t		*b_state;	/* ARC state of this header */
	multilist_node_t	b_arc_node;	/* multilist linkage */

	/* updated atomically */
	clock_t			b_arc_access;
	/* per-header hit counters, one per ARC state plus one for L2ARC */
	uint32_t		b_mru_hits;
	uint32_t		b_mru_ghost_hits;
	uint32_t		b_mfu_hits;
	uint32_t		b_mfu_ghost_hits;
	uint32_t		b_l2_hits;

	/* self protecting */
	zfs_refcount_t		b_refcnt;

	/* pointer to an arc_callback_t record (chained via acb_next) */
	arc_callback_t		*b_acb;
	/*
	 * abd holding the header's data; the arcstat_compressed_size comment
	 * describes it as the place where the compressed bytes are stored.
	 */
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;
179 
/*
 * Flag bits for the persistent L2ARC device header; the dh_flags field of
 * l2arc_dev_hdr_phys_t is annotated with this type.
 */
typedef enum l2arc_dev_hdr_flags_t {
	/* mirror of l2ad_first, the "first sweep through" flag of the device */
	L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)	/* mirror of l2ad_first */
} l2arc_dev_hdr_flags_t;
183 
/*
 * Pointer used in persistent L2ARC (for pointing to log blocks).
 *
 * It appears in the device header (dh_start_lbps[]) and in each log block
 * (lb_prev_lbp), which is how log blocks are chained together.
 */
typedef struct l2arc_log_blkptr {
	/*
	 * Offset of log block within the device, in bytes
	 */
	uint64_t	lbp_daddr;
	/*
	 * Aligned payload size (in bytes) of the log block
	 */
	uint64_t	lbp_payload_asize;
	/*
	 * Offset in bytes of the first buffer in the payload
	 */
	uint64_t	lbp_payload_start;
	/*
	 * lbp_prop has the following format:
	 *	* logical size (in bytes)
	 *	* aligned (after compression) size (in bytes)
	 *	* compression algorithm (we always LZ4-compress l2arc logs)
	 *	* checksum algorithm (used for lbp_cksum)
	 *
	 * The individual fields are packed into this one 64-bit word and
	 * are accessed through the L2BLK_GET_xxx/L2BLK_SET_xxx macros.
	 */
	uint64_t	lbp_prop;
	zio_cksum_t	lbp_cksum;	/* checksum of log */
} l2arc_log_blkptr_t;
210 
/*
 * The persistent L2ARC device header.
 * Byte order of magic determines whether 64-bit bswap of fields is necessary.
 *
 * The structure is asserted below to be exactly SPA_MINBLOCKSIZE bytes, so
 * any field added here has to be paid for by shrinking dh_pad.
 */
typedef struct l2arc_dev_hdr_phys {
	uint64_t	dh_magic;	/* L2ARC_DEV_HDR_MAGIC */
	uint64_t	dh_version;	/* Persistent L2ARC version */

	/*
	 * Global L2ARC device state and metadata.
	 */
	uint64_t	dh_spa_guid;
	uint64_t	dh_vdev_guid;
	uint64_t	dh_log_entries;		/* mirror of l2ad_log_entries */
	uint64_t	dh_evict;		/* evicted offset in bytes */
	uint64_t	dh_flags;		/* l2arc_dev_hdr_flags_t */
	/*
	 * Used in zdb.c for determining if a log block is valid, in the same
	 * way that l2arc_rebuild() does.
	 */
	uint64_t	dh_start;		/* mirror of l2ad_start */
	uint64_t	dh_end;			/* mirror of l2ad_end */
	/*
	 * Start of log block chain. [0] -> newest log, [1] -> one older (used
	 * for initiating prefetch).
	 */
	l2arc_log_blkptr_t	dh_start_lbps[2];
	/*
	 * Aligned size of all log blocks as accounted by vdev_space_update().
	 */
	uint64_t	dh_lb_asize;		/* mirror of l2ad_lb_asize */
	uint64_t	dh_lb_count;		/* mirror of l2ad_lb_count */
	/*
	 * Mirrors of vdev_trim_action_time and vdev_trim_state, used to
	 * display when the cache device was fully trimmed for the last
	 * time.
	 */
	uint64_t		dh_trim_action_time;
	uint64_t		dh_trim_state;
	const uint64_t		dh_pad[30];	/* pad to 512 bytes */
	zio_eck_t		dh_tail;	/* trailing zio_eck_t */
} l2arc_dev_hdr_phys_t;
/* The device header must occupy exactly one minimum-size block. */
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
254 
/*
 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
 */
typedef struct l2arc_log_ent_phys {
	dva_t			le_dva;		/* dva of buffer */
	uint64_t		le_birth;	/* birth txg of buffer */
	/*
	 * le_prop has the following format:
	 *	* logical size (in bytes)
	 *	* physical (compressed) size (in bytes)
	 *	* compression algorithm
	 *	* object type (used to restore arc_buf_contents_t)
	 *	* protected status (used for encryption)
	 *	* prefetch status (used in l2arc_read_done())
	 *
	 * The individual fields are packed into this one 64-bit word and
	 * are accessed through the L2BLK_GET_xxx/L2BLK_SET_xxx macros.
	 */
	uint64_t		le_prop;
	uint64_t		le_daddr;	/* buf location on l2dev */
	/*
	 * NOTE(review): presumably the compression level of the buffer
	 * (compare b_complevel in struct arc_buf_hdr) - confirm.
	 */
	uint64_t		le_complevel;
	/*
	 * We pad the size of each entry to a power of 2 so that the size of
	 * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
	 * because of the L2BLK_SET_*SIZE macros.
	 */
	const uint64_t		le_pad[2];	/* pad to 64 bytes	 */
} l2arc_log_ent_phys_t;
280 
/* Maximum number of entries in one log block; sizes lb_entries[] below. */
#define	L2ARC_LOG_BLK_MAX_ENTRIES	(1022)

/*
 * A log block of up to 1022 ARC buffer log entries, chained into the
 * persistent L2ARC metadata linked list. Byte order of magic determines
 * whether 64-bit bswap of fields is necessary.
 *
 * Layout: a header section padded to 128 bytes, followed by the payload
 * of L2ARC_LOG_BLK_MAX_ENTRIES entries that are each padded to 64 bytes.
 */
typedef struct l2arc_log_blk_phys {
	uint64_t		lb_magic;	/* L2ARC_LOG_BLK_MAGIC */
	/*
	 * There are 2 chains (headed by dh_start_lbps[2]), and this field
	 * points back to the previous block in this chain. We alternate
	 * which chain we append to, so they are time-wise and offset-wise
	 * interleaved, but that is an optimization rather than for
	 * correctness.
	 */
	l2arc_log_blkptr_t	lb_prev_lbp;	/* pointer to prev log block */
	/*
	 * Pad header section to 128 bytes
	 */
	uint64_t		lb_pad[7];
	/* Payload */
	l2arc_log_ent_phys_t	lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
} l2arc_log_blk_phys_t;				/* 64K total */

/*
 * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
 * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
 */
CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
    1ULL << SPA_MINBLOCKSHIFT));
/* A log block must also fit within the minimum and maximum block sizes. */
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
314 
/*
 * These structures hold in-flight abd buffers for log blocks as they're being
 * written to the L2ARC device.
 */
typedef struct l2arc_lb_abd_buf {
	abd_t		*abd;	/* abd buffer of the in-flight log block */
	list_node_t	node;	/* linkage for the list_t holding this entry */
} l2arc_lb_abd_buf_t;
323 
/*
 * These structures hold pointers to log blocks present on the L2ARC device.
 */
typedef struct l2arc_lb_ptr_buf {
	l2arc_log_blkptr_t	*lb_ptr;	/* pointer to the log block ptr */
	/*
	 * List linkage.  NOTE(review): presumably for l2ad_lbptr_list in
	 * l2arc_dev_t, which is described as the list of pointers to log
	 * blocks present in the device - confirm.
	 */
	list_node_t		node;
} l2arc_lb_ptr_buf_t;
331 
/*
 * Macros for getting and setting the fields packed into le_prop and
 * lbp_prop.  Reading the numeric arguments as (low bit, width), the
 * layout of the 64-bit word is:
 *
 *	bit  0, SPA_LSIZEBITS wide	LSIZE
 *	bit 16, SPA_PSIZEBITS wide	PSIZE
 *	bit 32, SPA_COMPRESSBITS wide	COMPRESS
 *	bit 39, 1 bit wide		PREFETCH
 *	bit 40, 8 bits wide		CHECKSUM
 *	bit 48, 8 bits wide		TYPE
 *	bit 56, 1 bit wide		PROTECTED
 *
 * NOTE(review): this assumes the BF64_* helpers take (value, low bit,
 * length, ...) and that the extra SPA_MINBLOCKSHIFT and 1 arguments of
 * the _SB variants are a shift and a bias - confirm against their
 * definitions.
 */
#define	L2BLK_GET_LSIZE(field)	\
	BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_LSIZE(field, x)	\
	BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define	L2BLK_GET_PSIZE(field)	\
	BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_PSIZE(field, x)	\
	BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define	L2BLK_GET_COMPRESS(field)	\
	BF64_GET((field), 32, SPA_COMPRESSBITS)
#define	L2BLK_SET_COMPRESS(field, x)	\
	BF64_SET((field), 32, SPA_COMPRESSBITS, x)
#define	L2BLK_GET_PREFETCH(field)	BF64_GET((field), 39, 1)
#define	L2BLK_SET_PREFETCH(field, x)	BF64_SET((field), 39, 1, x)
#define	L2BLK_GET_CHECKSUM(field)	BF64_GET((field), 40, 8)
#define	L2BLK_SET_CHECKSUM(field, x)	BF64_SET((field), 40, 8, x)
#define	L2BLK_GET_TYPE(field)		BF64_GET((field), 48, 8)
#define	L2BLK_SET_TYPE(field, x)	BF64_SET((field), 48, 8, x)
#define	L2BLK_GET_PROTECTED(field)	BF64_GET((field), 56, 1)
#define	L2BLK_SET_PROTECTED(field, x)	BF64_SET((field), 56, 1, x)
353 
/*
 * Swap the values of two pointer lvalues in place.
 *
 * Both arguments must be modifiable pointer lvalues.  Each one is
 * evaluated twice, so they must not have side effects.
 *
 * The scratch variable has a macro-specific name so that it cannot shadow
 * a caller variable passed as an argument (with a scratch variable called
 * "tmp", PTR_SWAP(tmp, other) would silently fail to swap).  The arguments
 * are parenthesized so that expressions such as *pp expand with the
 * intended precedence.
 */
#define	PTR_SWAP(x, y)				\
	do {					\
		void *ptr_swap_tmp_ = (x);	\
		(x) = (y);			\
		(y) = ptr_swap_tmp_;		\
		_NOTE(CONSTCOND)		\
	} while (0)
361 
/*
 * Magic numbers for the persistent L2ARC device header (dh_magic) and log
 * blocks (lb_magic).  Read from the most significant byte down, each value
 * spells an 8-character ASCII tag.
 */
#define	L2ARC_DEV_HDR_MAGIC	0x5A46534341434845ULL	/* "ZFSCACHE" */
#define	L2ARC_LOG_BLK_MAGIC	0x4C4F47424C4B4844ULL	/* "LOGBLKHD" */
364 
/*
 * L2ARC Internals
 *
 * In-memory state of one L2ARC device.  Several members are mirrored into
 * the persistent device header (l2arc_dev_hdr_phys_t), as noted on the
 * dh_* fields there.
 */
typedef struct l2arc_dev {
	vdev_t			*l2ad_vdev;	/* vdev */
	spa_t			*l2ad_spa;	/* spa */
	uint64_t		l2ad_hand;	/* next write location */
	uint64_t		l2ad_start;	/* first addr on device */
	uint64_t		l2ad_end;	/* last addr on device */
	boolean_t		l2ad_first;	/* first sweep through */
	boolean_t		l2ad_writing;	/* currently writing */
	kmutex_t		l2ad_mtx;	/* lock for buffer list */
	list_t			l2ad_buflist;	/* buffer list */
	list_node_t		l2ad_node;	/* device list node */
	zfs_refcount_t		l2ad_alloc;	/* allocated bytes */
	/*
	 * Persistence-related stuff
	 */
	l2arc_dev_hdr_phys_t	*l2ad_dev_hdr;	/* persistent device header */
	uint64_t		l2ad_dev_hdr_asize; /* aligned hdr size */
	/* embedded by value, so it adds a full log block to this struct */
	l2arc_log_blk_phys_t	l2ad_log_blk;	/* currently open log block */
	int			l2ad_log_ent_idx; /* index into cur log blk */
	/* Number of bytes in current log block's payload */
	uint64_t		l2ad_log_blk_payload_asize;
	/*
	 * Offset (in bytes) of the first buffer in current log block's
	 * payload.
	 */
	uint64_t		l2ad_log_blk_payload_start;
	/* Flag indicating whether a rebuild is scheduled or is going on */
	boolean_t		l2ad_rebuild;
	/*
	 * NOTE(review): by their names these two flags record a request to
	 * cancel the rebuild and whether the rebuild has started - confirm
	 * against the rebuild code.
	 */
	boolean_t		l2ad_rebuild_cancel;
	boolean_t		l2ad_rebuild_began;
	uint64_t		l2ad_log_entries;   /* entries per log blk  */
	uint64_t		l2ad_evict;	 /* evicted offset in bytes */
	/* List of pointers to log blocks present in the L2ARC device */
	list_t			l2ad_lbptr_list;
	/*
	 * Aligned size of all log blocks as accounted by vdev_space_update().
	 */
	zfs_refcount_t		l2ad_lb_asize;
	/*
	 * Number of log blocks present on the device.
	 */
	zfs_refcount_t		l2ad_lb_count;
	boolean_t		l2ad_trim_all; /* TRIM whole device */
} l2arc_dev_t;
412 
/*
 * Encrypted blocks will need to be stored encrypted on the L2ARC
 * disk as they appear in the main pool. In order for this to work we
 * need to pass around the encryption parameters so they can be used
 * to write data to the L2ARC. This struct is embedded in arc_buf_hdr_t
 * as b_crypt_hdr and is only defined there if the L1 header is defined
 * and the ARC_FLAG_ENCRYPTED flag is set.
 */
typedef struct arc_buf_hdr_crypt {
	abd_t			*b_rabd;	/* raw encrypted data */
	dmu_object_type_t	b_ot;		/* object type */
	uint32_t		b_ebufcnt;	/* count of encrypted buffers */

	/* dsobj for looking up encryption key for l2arc encryption */
	uint64_t		b_dsobj;

	/* encryption parameters: salt and IV, sized by the ZIO_DATA_* lengths */
	uint8_t			b_salt[ZIO_DATA_SALT_LEN];
	uint8_t			b_iv[ZIO_DATA_IV_LEN];

	/*
	 * Technically this could be removed since we will always be able to
	 * get the mac from the bp when we need it. However, it is inconvenient
	 * for callers of arc code to have to pass a bp in all the time. This
	 * also allows us to assert that L2ARC data is properly encrypted to
	 * match the data in the main storage pool.
	 */
	uint8_t			b_mac[ZIO_DATA_MAC_LEN];
} arc_buf_hdr_crypt_t;
442 
/*
 * L2ARC portion of an ARC buffer header.  It is embedded in arc_buf_hdr_t
 * as b_l2hdr, where it is documented as undefined when the buffer is not
 * in L2ARC.
 */
typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	/* NOTE(review): presumably a hit counter for this buffer - confirm */
	uint32_t		b_hits;
	/*
	 * List linkage.  NOTE(review): presumably for the device's
	 * l2ad_buflist - confirm.
	 */
	list_node_t		b_l2node;
} l2arc_buf_hdr_t;
450 
/*
 * Callback record for an L2ARC write: the target device, the head of the
 * list of buffers being written, and the log blocks that are in flight.
 */
typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;		/* device info */
	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
	/*
	 * in-flight list of log blocks
	 * NOTE(review): elements are presumably l2arc_lb_abd_buf_t - confirm.
	 */
	list_t		l2wcb_abd_list;
} l2arc_write_callback_t;
457 
/*
 * The ARC buffer header: common fields first, followed by the embedded
 * L2ARC, L1ARC and encryption sub-headers.
 *
 * NOTE(review): b_l1hdr and b_crypt_hdr are the trailing members, which is
 * presumably what allows shorter allocations that leave them out for
 * headers that are not in L1 - confirm against arc_hdr_realloc().  Do not
 * reorder members without checking that code.
 */
struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;

	arc_buf_contents_t	b_type;
	uint8_t			b_complevel;
	uint8_t			b_reserved1; /* used for 4 byte alignment */
	uint16_t		b_reserved2; /* used for 4 byte alignment */
	arc_buf_hdr_t		*b_hash_next;	/* next header in hash chain */
	arc_flags_t		b_flags;

	/*
	 * This field stores the size of the data buffer after
	 * compression, and is set in the arc's zio completion handlers.
	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
	 *
	 * While the block pointers can store up to 32MB in their psize
	 * field, we can only store up to 32MB minus 512B. This is due
	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
	 * a field of zeros represents 512B in the bp). We can't use a
	 * bias of 1 since we need to reserve a psize of zero, here, to
	 * represent holes and embedded blocks.
	 *
	 * This isn't a problem in practice, since the maximum size of a
	 * buffer is limited to 16MB, so we never need to store 32MB in
	 * this field. Even in the upstream illumos code base, the
	 * maximum size of a buffer is limited to 16MB.
	 */
	uint16_t		b_psize;

	/*
	 * This field stores the size of the data buffer before
	 * compression, and cannot change once set. It is in units
	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
	 */
	uint16_t		b_lsize;	/* immutable */
	uint64_t		b_spa;		/* immutable */

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
	/*
	 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
	 * is set and the L1 header exists.
	 */
	arc_buf_hdr_crypt_t b_crypt_hdr;
};
507 
508 typedef struct arc_stats {
509 	kstat_named_t arcstat_hits;
510 	kstat_named_t arcstat_misses;
511 	kstat_named_t arcstat_demand_data_hits;
512 	kstat_named_t arcstat_demand_data_misses;
513 	kstat_named_t arcstat_demand_metadata_hits;
514 	kstat_named_t arcstat_demand_metadata_misses;
515 	kstat_named_t arcstat_prefetch_data_hits;
516 	kstat_named_t arcstat_prefetch_data_misses;
517 	kstat_named_t arcstat_prefetch_metadata_hits;
518 	kstat_named_t arcstat_prefetch_metadata_misses;
519 	kstat_named_t arcstat_mru_hits;
520 	kstat_named_t arcstat_mru_ghost_hits;
521 	kstat_named_t arcstat_mfu_hits;
522 	kstat_named_t arcstat_mfu_ghost_hits;
523 	kstat_named_t arcstat_deleted;
524 	/*
525 	 * Number of buffers that could not be evicted because the hash lock
526 	 * was held by another thread.  The lock may not necessarily be held
527 	 * by something using the same buffer, since hash locks are shared
528 	 * by multiple buffers.
529 	 */
530 	kstat_named_t arcstat_mutex_miss;
531 	/*
532 	 * Number of buffers skipped when updating the access state due to the
533 	 * header having already been released after acquiring the hash lock.
534 	 */
535 	kstat_named_t arcstat_access_skip;
536 	/*
537 	 * Number of buffers skipped because they have I/O in progress, are
538 	 * indirect prefetch buffers that have not lived long enough, or are
539 	 * not from the spa we're trying to evict from.
540 	 */
541 	kstat_named_t arcstat_evict_skip;
542 	/*
543 	 * Number of times arc_evict_state() was unable to evict enough
544 	 * buffers to reach its target amount.
545 	 */
546 	kstat_named_t arcstat_evict_not_enough;
547 	kstat_named_t arcstat_evict_l2_cached;
548 	kstat_named_t arcstat_evict_l2_eligible;
549 	kstat_named_t arcstat_evict_l2_ineligible;
550 	kstat_named_t arcstat_evict_l2_skip;
551 	kstat_named_t arcstat_hash_elements;
552 	kstat_named_t arcstat_hash_elements_max;
553 	kstat_named_t arcstat_hash_collisions;
554 	kstat_named_t arcstat_hash_chains;
555 	kstat_named_t arcstat_hash_chain_max;
556 	kstat_named_t arcstat_p;
557 	kstat_named_t arcstat_c;
558 	kstat_named_t arcstat_c_min;
559 	kstat_named_t arcstat_c_max;
560 	/* Not updated directly; only synced in arc_kstat_update. */
561 	kstat_named_t arcstat_size;
562 	/*
563 	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
564 	 * Note that the compressed bytes may match the uncompressed bytes
565 	 * if the block is either not compressed or compressed arc is disabled.
566 	 */
567 	kstat_named_t arcstat_compressed_size;
568 	/*
569 	 * Uncompressed size of the data stored in b_pabd. If compressed
570 	 * arc is disabled then this value will be identical to the stat
571 	 * above.
572 	 */
573 	kstat_named_t arcstat_uncompressed_size;
574 	/*
575 	 * Number of bytes stored in all the arc_buf_t's. This is classified
576 	 * as "overhead" since this data is typically short-lived and will
577 	 * be evicted from the arc when it becomes unreferenced unless the
578 	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
579 	 * values have been set (see comment in dbuf.c for more information).
580 	 */
581 	kstat_named_t arcstat_overhead_size;
582 	/*
583 	 * Number of bytes consumed by internal ARC structures necessary
584 	 * for tracking purposes; these structures are not actually
585 	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
586 	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
587 	 * caches), and arc_buf_t structures (allocated via arc_buf_t
588 	 * cache).
589 	 * Not updated directly; only synced in arc_kstat_update.
590 	 */
591 	kstat_named_t arcstat_hdr_size;
592 	/*
593 	 * Number of bytes consumed by ARC buffers of type equal to
594 	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
595 	 * on disk user data (e.g. plain file contents).
596 	 * Not updated directly; only synced in arc_kstat_update.
597 	 */
598 	kstat_named_t arcstat_data_size;
599 	/*
600 	 * Number of bytes consumed by ARC buffers of type equal to
601 	 * ARC_BUFC_METADATA. This is generally consumed by buffers
602 	 * backing on disk data that is used for internal ZFS
603 	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
604 	 * Not updated directly; only synced in arc_kstat_update.
605 	 */
606 	kstat_named_t arcstat_metadata_size;
607 	/*
608 	 * Number of bytes consumed by dmu_buf_impl_t objects.
609 	 * Not updated directly; only synced in arc_kstat_update.
610 	 */
611 	kstat_named_t arcstat_dbuf_size;
612 	/*
613 	 * Number of bytes consumed by dnode_t objects.
614 	 * Not updated directly; only synced in arc_kstat_update.
615 	 */
616 	kstat_named_t arcstat_dnode_size;
617 	/*
618 	 * Number of bytes consumed by bonus buffers.
619 	 * Not updated directly; only synced in arc_kstat_update.
620 	 */
621 	kstat_named_t arcstat_bonus_size;
622 #if defined(COMPAT_FREEBSD11)
623 	/*
624 	 * Sum of the previous three counters, provided for compatibility.
625 	 */
626 	kstat_named_t arcstat_other_size;
627 #endif
628 
629 	/*
630 	 * Total number of bytes consumed by ARC buffers residing in the
631 	 * arc_anon state. This includes *all* buffers in the arc_anon
632 	 * state; e.g. data, metadata, evictable, and unevictable buffers
633 	 * are all included in this value.
634 	 * Not updated directly; only synced in arc_kstat_update.
635 	 */
636 	kstat_named_t arcstat_anon_size;
637 	/*
638 	 * Number of bytes consumed by ARC buffers that meet the
639 	 * following criteria: backing buffers of type ARC_BUFC_DATA,
640 	 * residing in the arc_anon state, and are eligible for eviction
641 	 * (e.g. have no outstanding holds on the buffer).
642 	 * Not updated directly; only synced in arc_kstat_update.
643 	 */
644 	kstat_named_t arcstat_anon_evictable_data;
645 	/*
646 	 * Number of bytes consumed by ARC buffers that meet the
647 	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
648 	 * residing in the arc_anon state, and are eligible for eviction
649 	 * (e.g. have no outstanding holds on the buffer).
650 	 * Not updated directly; only synced in arc_kstat_update.
651 	 */
652 	kstat_named_t arcstat_anon_evictable_metadata;
653 	/*
654 	 * Total number of bytes consumed by ARC buffers residing in the
655 	 * arc_mru state. This includes *all* buffers in the arc_mru
656 	 * state; e.g. data, metadata, evictable, and unevictable buffers
657 	 * are all included in this value.
658 	 * Not updated directly; only synced in arc_kstat_update.
659 	 */
660 	kstat_named_t arcstat_mru_size;
661 	/*
662 	 * Number of bytes consumed by ARC buffers that meet the
663 	 * following criteria: backing buffers of type ARC_BUFC_DATA,
664 	 * residing in the arc_mru state, and are eligible for eviction
665 	 * (e.g. have no outstanding holds on the buffer).
666 	 * Not updated directly; only synced in arc_kstat_update.
667 	 */
668 	kstat_named_t arcstat_mru_evictable_data;
669 	/*
670 	 * Number of bytes consumed by ARC buffers that meet the
671 	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
672 	 * residing in the arc_mru state, and are eligible for eviction
673 	 * (e.g. have no outstanding holds on the buffer).
674 	 * Not updated directly; only synced in arc_kstat_update.
675 	 */
676 	kstat_named_t arcstat_mru_evictable_metadata;
677 	/*
678 	 * Total number of bytes that *would have been* consumed by ARC
679 	 * buffers in the arc_mru_ghost state. The key thing to note
680 	 * here, is the fact that this size doesn't actually indicate
681 	 * RAM consumption. The ghost lists only consist of headers and
682 	 * don't actually have ARC buffers linked off of these headers.
683 	 * Thus, *if* the headers had associated ARC buffers, these
684 	 * buffers *would have* consumed this number of bytes.
685 	 * Not updated directly; only synced in arc_kstat_update.
686 	 */
687 	kstat_named_t arcstat_mru_ghost_size;
688 	/*
689 	 * Number of bytes that *would have been* consumed by ARC
690 	 * buffers that are eligible for eviction, of type
691 	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
692 	 * Not updated directly; only synced in arc_kstat_update.
693 	 */
694 	kstat_named_t arcstat_mru_ghost_evictable_data;
695 	/*
696 	 * Number of bytes that *would have been* consumed by ARC
697 	 * buffers that are eligible for eviction, of type
698 	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
699 	 * Not updated directly; only synced in arc_kstat_update.
700 	 */
701 	kstat_named_t arcstat_mru_ghost_evictable_metadata;
702 	/*
703 	 * Total number of bytes consumed by ARC buffers residing in the
704 	 * arc_mfu state. This includes *all* buffers in the arc_mfu
705 	 * state; e.g. data, metadata, evictable, and unevictable buffers
706 	 * are all included in this value.
707 	 * Not updated directly; only synced in arc_kstat_update.
708 	 */
709 	kstat_named_t arcstat_mfu_size;
710 	/*
711 	 * Number of bytes consumed by ARC buffers that are eligible for
712 	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
713 	 * state.
714 	 * Not updated directly; only synced in arc_kstat_update.
715 	 */
716 	kstat_named_t arcstat_mfu_evictable_data;
717 	/*
718 	 * Number of bytes consumed by ARC buffers that are eligible for
719 	 * eviction, of type ARC_BUFC_METADATA, and reside in the
720 	 * arc_mfu state.
721 	 * Not updated directly; only synced in arc_kstat_update.
722 	 */
723 	kstat_named_t arcstat_mfu_evictable_metadata;
724 	/*
725 	 * Total number of bytes that *would have been* consumed by ARC
726 	 * buffers in the arc_mfu_ghost state. See the comment above
727 	 * arcstat_mru_ghost_size for more details.
728 	 * Not updated directly; only synced in arc_kstat_update.
729 	 */
730 	kstat_named_t arcstat_mfu_ghost_size;
731 	/*
732 	 * Number of bytes that *would have been* consumed by ARC
733 	 * buffers that are eligible for eviction, of type
734 	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
735 	 * Not updated directly; only synced in arc_kstat_update.
736 	 */
737 	kstat_named_t arcstat_mfu_ghost_evictable_data;
738 	/*
739 	 * Number of bytes that *would have been* consumed by ARC
740 	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
742 	 * Not updated directly; only synced in arc_kstat_update.
743 	 */
744 	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
745 	kstat_named_t arcstat_l2_hits;
746 	kstat_named_t arcstat_l2_misses;
747 	kstat_named_t arcstat_l2_feeds;
748 	kstat_named_t arcstat_l2_rw_clash;
749 	kstat_named_t arcstat_l2_read_bytes;
750 	kstat_named_t arcstat_l2_write_bytes;
751 	kstat_named_t arcstat_l2_writes_sent;
752 	kstat_named_t arcstat_l2_writes_done;
753 	kstat_named_t arcstat_l2_writes_error;
754 	kstat_named_t arcstat_l2_writes_lock_retry;
755 	kstat_named_t arcstat_l2_evict_lock_retry;
756 	kstat_named_t arcstat_l2_evict_reading;
757 	kstat_named_t arcstat_l2_evict_l1cached;
758 	kstat_named_t arcstat_l2_free_on_write;
759 	kstat_named_t arcstat_l2_abort_lowmem;
760 	kstat_named_t arcstat_l2_cksum_bad;
761 	kstat_named_t arcstat_l2_io_error;
762 	kstat_named_t arcstat_l2_lsize;
763 	kstat_named_t arcstat_l2_psize;
764 	/* Not updated directly; only synced in arc_kstat_update. */
765 	kstat_named_t arcstat_l2_hdr_size;
766 	/*
767 	 * Number of L2ARC log blocks written. These are used for restoring the
768 	 * L2ARC. Updated during writing of L2ARC log blocks.
769 	 */
770 	kstat_named_t arcstat_l2_log_blk_writes;
771 	/*
772 	 * Moving average of the aligned size of the L2ARC log blocks, in
773 	 * bytes. Updated during L2ARC rebuild and during writing of L2ARC
774 	 * log blocks.
775 	 */
776 	kstat_named_t arcstat_l2_log_blk_avg_asize;
777 	/* Aligned size of L2ARC log blocks on L2ARC devices. */
778 	kstat_named_t arcstat_l2_log_blk_asize;
779 	/* Number of L2ARC log blocks present on L2ARC devices. */
780 	kstat_named_t arcstat_l2_log_blk_count;
781 	/*
782 	 * Moving average of the aligned size of L2ARC restored data, in bytes,
783 	 * to the aligned size of their metadata in L2ARC, in bytes.
784 	 * Updated during L2ARC rebuild and during writing of L2ARC log blocks.
785 	 */
786 	kstat_named_t arcstat_l2_data_to_meta_ratio;
787 	/*
788 	 * Number of times the L2ARC rebuild was successful for an L2ARC device.
789 	 */
790 	kstat_named_t arcstat_l2_rebuild_success;
791 	/*
792 	 * Number of times the L2ARC rebuild failed because the device header
793 	 * was in an unsupported format or corrupted.
794 	 */
795 	kstat_named_t arcstat_l2_rebuild_abort_unsupported;
796 	/*
797 	 * Number of times the L2ARC rebuild failed because of IO errors
798 	 * while reading a log block.
799 	 */
800 	kstat_named_t arcstat_l2_rebuild_abort_io_errors;
801 	/*
802 	 * Number of times the L2ARC rebuild failed because of IO errors when
803 	 * reading the device header.
804 	 */
805 	kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
806 	/*
807 	 * Number of L2ARC log blocks which failed to be restored due to
808 	 * checksum errors.
809 	 */
810 	kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
811 	/*
812 	 * Number of times the L2ARC rebuild was aborted due to low system
813 	 * memory.
814 	 */
815 	kstat_named_t arcstat_l2_rebuild_abort_lowmem;
816 	/* Logical size of L2ARC restored data, in bytes. */
817 	kstat_named_t arcstat_l2_rebuild_size;
818 	/* Aligned size of L2ARC restored data, in bytes. */
819 	kstat_named_t arcstat_l2_rebuild_asize;
820 	/*
821 	 * Number of L2ARC log entries (buffers) that were successfully
822 	 * restored in ARC.
823 	 */
824 	kstat_named_t arcstat_l2_rebuild_bufs;
825 	/*
826 	 * Number of L2ARC log entries (buffers) already cached in ARC. These
827 	 * were not restored again.
828 	 */
829 	kstat_named_t arcstat_l2_rebuild_bufs_precached;
830 	/*
831 	 * Number of L2ARC log blocks that were restored successfully. Each
832 	 * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
833 	 */
834 	kstat_named_t arcstat_l2_rebuild_log_blks;
835 	kstat_named_t arcstat_memory_throttle_count;
836 	kstat_named_t arcstat_memory_direct_count;
837 	kstat_named_t arcstat_memory_indirect_count;
838 	kstat_named_t arcstat_memory_all_bytes;
839 	kstat_named_t arcstat_memory_free_bytes;
840 	kstat_named_t arcstat_memory_available_bytes;
841 	kstat_named_t arcstat_no_grow;
842 	kstat_named_t arcstat_tempreserve;
843 	kstat_named_t arcstat_loaned_bytes;
844 	kstat_named_t arcstat_prune;
845 	/* Not updated directly; only synced in arc_kstat_update. */
846 	kstat_named_t arcstat_meta_used;
847 	kstat_named_t arcstat_meta_limit;
848 	kstat_named_t arcstat_dnode_limit;
849 	kstat_named_t arcstat_meta_max;
850 	kstat_named_t arcstat_meta_min;
851 	kstat_named_t arcstat_async_upgrade_sync;
852 	kstat_named_t arcstat_demand_hit_predictive_prefetch;
853 	kstat_named_t arcstat_demand_hit_prescient_prefetch;
854 	kstat_named_t arcstat_need_free;
855 	kstat_named_t arcstat_sys_free;
856 	kstat_named_t arcstat_raw_size;
857 	kstat_named_t arcstat_cached_only_in_progress;
858 	kstat_named_t arcstat_abd_chunk_waste_size;
859 } arc_stats_t;
860 
/*
 * Record describing one waiter; it embeds its own list linkage and a
 * condition variable.
 * NOTE(review): presumably used by arc_wait_for_eviction() to block a
 * caller until eviction has progressed -- confirm against the users of
 * this type.
 */
typedef struct arc_evict_waiter {
	/* linkage used when this record is placed on a list_t */
	list_node_t aew_node;
	/* condition variable belonging to this waiter */
	kcondvar_t aew_cv;
	/*
	 * NOTE(review): looks like the eviction count this waiter is
	 * waiting for -- confirm units and meaning at the point of use.
	 */
	uint64_t aew_count;
} arc_evict_waiter_t;
866 
/* The ui64 value slot of the named arc_stats entry, usable as an lvalue. */
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/*
 * Add val to the named arc_stats entry through atomic_add_64().
 * val may be negative (see ARCSTAT_BUMPDOWN).
 */
#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&ARCSTAT(stat), (val))

/* Step the named arc_stats entry up, or down, by exactly one. */
#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
874 
/*
 * Short names for values that are stored inside arc_stats.  Each one
 * expands, through ARCSTAT(), to the ui64 slot of the matching kstat.
 */

/* target size of cache */
#define	arc_c		ARCSTAT(arcstat_c)
/* min target cache size */
#define	arc_c_min	ARCSTAT(arcstat_c_min)
/* max target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)
/* target size of MRU */
#define	arc_p		ARCSTAT(arcstat_p)
/* do not grow cache size */
#define	arc_no_grow	ARCSTAT(arcstat_no_grow)
/* target system free bytes */
#define	arc_sys_free	ARCSTAT(arcstat_sys_free)
881 
882 extern taskq_t *arc_prune_taskq;
883 extern arc_stats_t arc_stats;
884 extern hrtime_t arc_growtime;
885 extern boolean_t arc_warm;
886 extern int arc_grow_retry;
887 extern int arc_no_grow_shift;
888 extern int arc_shrink_shift;
889 extern kmutex_t arc_prune_mtx;
890 extern list_t arc_prune_list;
891 extern aggsum_t arc_size;
892 extern arc_state_t	*arc_mfu;
893 extern arc_state_t	*arc_mru;
894 extern uint_t zfs_arc_pc_percent;
895 extern int arc_lotsfree_percent;
896 extern unsigned long zfs_arc_min;
897 extern unsigned long zfs_arc_max;
898 
899 extern void arc_reduce_target_size(int64_t to_free);
900 extern boolean_t arc_reclaim_needed(void);
901 extern void arc_kmem_reap_soon(void);
902 extern boolean_t arc_is_overflowing(void);
903 extern void arc_wait_for_eviction(uint64_t);
904 
905 extern void arc_lowmem_init(void);
906 extern void arc_lowmem_fini(void);
907 extern void arc_prune_async(int64_t);
908 extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg);
909 extern uint64_t arc_free_memory(void);
910 extern int64_t arc_available_memory(void);
911 extern void arc_tuning_update(boolean_t);
912 
913 extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
914 extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
915 
916 /* used in zdb.c */
917 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
918     const l2arc_log_blkptr_t *lbp);
919 
920 /* used in vdev_trim.c */
921 void l2arc_dev_hdr_update(l2arc_dev_t *dev);
922 l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
923 
924 #ifdef __cplusplus
925 }
926 #endif
927 
928 #endif /* _SYS_ARC_IMPL_H */
929