1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * DVA-based Adjustable Replacement Cache
28 *
29 * While much of the theory of operation used here is
30 * based on the self-tuning, low overhead replacement cache
31 * presented by Megiddo and Modha at FAST 2003, there are some
32 * significant differences:
33 *
34 * 1. The Megiddo and Modha model assumes any page is evictable.
35 * Pages in its cache cannot be "locked" into memory. This makes
36 * the eviction algorithm simple: evict the last page in the list.
37 * This also makes the performance characteristics easy to reason
38 * about. Our cache is not so simple. At any given moment, some
39 * subset of the blocks in the cache are un-evictable because we
40 * have handed out a reference to them. Blocks are only evictable
41 * when there are no external references active. This makes
42 * eviction far more problematic: we choose to evict the evictable
43 * blocks that are the "lowest" in the list.
44 *
45 * There are times when it is not possible to evict the requested
46 * space. In these circumstances we are unable to adjust the cache
47 * size. To prevent the cache growing unbounded at these times we
48 * implement a "cache throttle" that slows the flow of new data
49 * into the cache until we can make space available.
50 *
51 * 2. The Megiddo and Modha model assumes a fixed cache size.
52 * Pages are evicted when the cache is full and there is a cache
53 * miss. Our model has a variable sized cache. It grows with
54 * high use, but also tries to react to memory pressure from the
55 * operating system: decreasing its size when system memory is
56 * tight.
57 *
58 * 3. The Megiddo and Modha model assumes a fixed page size. All
59 * elements of the cache are therefore exactly the same size. So
60 * when adjusting the cache size following a cache miss, it's simply
61 * a matter of choosing a single page to evict. In our model, we
62 * have variable sized cache blocks (ranging from 512 bytes to
63 * 128K bytes). We therefore choose a set of blocks to evict to make
64 * space for a cache miss that approximates as closely as possible
65 * the space used by the new block (see the sketch after this comment).
66 *
67 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
68 * by N. Megiddo & D. Modha, FAST 2003
69 */
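/*
 * Illustrative sketch of point (3) above -- a hedged, hypothetical
 * helper, not the real eviction path (see arc_evict() later in this
 * file): walk the tail of an eviction list and free evictable blocks
 * until the space recovered approximates the size of the new block.
 * referenced() and evict() are stand-in names for illustration only.
 *
 *	uint64_t freed = 0;
 *
 *	for (ab = list_tail(list); ab != NULL && freed < needed;
 *	    ab = ab_prev) {
 *		ab_prev = list_prev(list, ab);
 *		if (referenced(ab))
 *			continue;		(un-evictable, skip it)
 *		freed += ab->b_size;		(anywhere from 512B to 128K)
 *		evict(ab);
 *	}
 */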
70
71 /*
72 * The locking model:
73 *
74 * A new reference to a cache buffer can be obtained in two
75 * ways: 1) via a hash table lookup using the DVA as a key,
76 * or 2) via one of the ARC lists. The arc_read() interface
77 * uses method 1, while the internal arc algorithms for
78 * adjusting the cache use method 2. We therefore provide two
79 * types of locks: 1) the hash table lock array, and 2) the
80 * arc list locks.
81 *
82 * Buffers do not have their own mutexes, rather they rely on the
83 * hash table mutexes for the bulk of their protection (i.e. most
84 * fields in the arc_buf_hdr_t are protected by these mutexes).
85 *
86 * buf_hash_find() returns the appropriate mutex (held) when it
87 * locates the requested buffer in the hash table. It returns
88 * NULL for the mutex if the buffer was not in the table.
89 *
90 * buf_hash_remove() expects the appropriate hash mutex to be
91 * already held before it is invoked.
92 *
93 * Each arc state also has a mutex which is used to protect the
94 * buffer list associated with the state. When attempting to
95 * obtain a hash table lock while holding an arc list lock you
96 * must use mutex_tryenter() to avoid deadlock; a sketch follows. Also note that
97 * the active state mutex must be held before the ghost state mutex.
98 *
99 * Arc buffers may have an associated eviction callback function.
100 * This function will be invoked prior to removing the buffer (e.g.
101 * in arc_do_user_evicts()). Note however that the data associated
102 * with the buffer may be evicted prior to the callback. The callback
103 * must be made with *no locks held* (to prevent deadlock). Additionally,
104 * the users of callbacks must ensure that their private data is
105 * protected from simultaneous callbacks from arc_buf_evict()
106 * and arc_do_user_evicts().
107 *
108 * Note that the majority of the performance stats are manipulated
109 * with atomic operations.
110 *
111 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
112 *
113 * - L2ARC buflist creation
114 * - L2ARC buflist eviction
115 * - L2ARC write completion, which walks L2ARC buflists
116 * - ARC header destruction, as it removes from L2ARC buflists
117 * - ARC header release, as it removes from L2ARC buflists
118 */
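/*
 * A minimal sketch of the mutex_tryenter() rule above, mirroring the
 * pattern used by arc_evict() later in this file (illustrative only):
 *
 *	mutex_enter(&state->arcs_mtx);			(arc list lock)
 *	for each candidate header ab on the list {
 *		hash_lock = HDR_LOCK(ab);
 *		have_lock = MUTEX_HELD(hash_lock);
 *		if (have_lock || mutex_tryenter(hash_lock)) {
 *			... evict or move ab ...
 *			if (!have_lock)
 *				mutex_exit(hash_lock);
 *		} else {
 *			skip ab: blocking here could deadlock against
 *			an arc_read() caller that already holds the
 *			hash lock and now wants the list lock
 *		}
 *	}
 *	mutex_exit(&state->arcs_mtx);
 */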
119
120 #include <sys/spa.h>
121 #include <sys/zio.h>
122 #include <sys/zfs_context.h>
123 #include <sys/arc.h>
124 #include <sys/refcount.h>
125 #include <sys/vdev.h>
126 #include <sys/vdev_impl.h>
127 #ifdef _KERNEL
128 #include <sys/vmsystm.h>
129 #include <vm/anon.h>
130 #include <sys/fs/swapnode.h>
131 #include <sys/dnlc.h>
132 #endif
133 #include <sys/callb.h>
134 #include <sys/kstat.h>
135 #include <zfs_fletcher.h>
136
137 #ifdef __NetBSD__
138 #include <uvm/uvm.h>
139 #ifndef btop
140 #define btop(x) ((x) / PAGE_SIZE)
141 #endif
142 #define needfree (uvmexp.free < uvmexp.freetarg ? uvmexp.freetarg : 0)
143 #define buf_init arc_buf_init
144 #define freemem uvmexp.free
145 #define minfree uvmexp.freemin
146 #define desfree uvmexp.freetarg
147 #define lotsfree (desfree * 2)
148 #define availrmem desfree
149 #define swapfs_minfree 0
150 #define swapfs_reserve 0
151 #undef curproc
152 #define curproc curlwp
153 #define proc_pageout uvm.pagedaemon_lwp
154
155 static void *zio_arena;
156
157 #include <sys/callback.h>
158 /* Structures used for memory and kva space reclaim. */
159 static struct callback_entry arc_kva_reclaim_entry;
160
161 #endif /* __NetBSD__ */
162
163 static kmutex_t arc_reclaim_thr_lock;
164 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
165 static uint8_t arc_thread_exit;
166
167 extern int zfs_write_limit_shift;
168 extern uint64_t zfs_write_limit_max;
169 extern kmutex_t zfs_write_limit_lock;
170
171 #define ARC_REDUCE_DNLC_PERCENT 3
172 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
173
174 typedef enum arc_reclaim_strategy {
175 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
176 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
177 } arc_reclaim_strategy_t;
178
179 /* number of seconds before growing cache again */
180 static int arc_grow_retry = 60;
181
182 /* shift of arc_c for calculating both min and max arc_p */
183 static int arc_p_min_shift = 4;
184
185 /* log2(fraction of arc to reclaim) */
186 static int arc_shrink_shift = 5;
187
188 /*
189 * minimum lifespan of a prefetch block in clock ticks
190 * (initialized in arc_init())
191 */
192 static int arc_min_prefetch_lifespan;
193
194 static int arc_dead;
195
196 /*
197 * The arc has filled available memory and has now warmed up.
198 */
199 static boolean_t arc_warm;
200
201 /*
202 * These tunables are for performance analysis.
203 */
204 uint64_t zfs_arc_max;
205 uint64_t zfs_arc_min;
206 uint64_t zfs_arc_meta_limit = 0;
207 int zfs_arc_grow_retry = 0;
208 int zfs_arc_shrink_shift = 0;
209 int zfs_arc_p_min_shift = 0;
210
211 /*
212 * Note that buffers can be in one of 6 states:
213 * ARC_anon - anonymous (discussed below)
214 * ARC_mru - recently used, currently cached
215 * ARC_mru_ghost - recently used, no longer in cache
216 * ARC_mfu - frequently used, currently cached
217 * ARC_mfu_ghost - frequently used, no longer in cache
218 * ARC_l2c_only - exists in L2ARC but not other states
219 * When there are no active references to the buffer, they are
220 * linked onto a list in one of these arc states. These are
221 * the only buffers that can be evicted or deleted. Within each
222 * state there are multiple lists, one for meta-data and one for
223 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
224 * etc.) is tracked separately so that it can be managed more
225 * explicitly: favored over data, limited explicitly.
226 *
227 * Anonymous buffers are buffers that are not associated with
228 * a DVA. These are buffers that hold dirty block copies
229 * before they are written to stable storage. By definition,
230 * they are "ref'd" and are considered part of arc_mru
231 * that cannot be freed. Generally, they will acquire a DVA
232 * as they are written and migrate onto the arc_mru list.
233 *
234 * The ARC_l2c_only state is for buffers that are in the second
235 * level ARC but no longer in any of the ARC_m* lists. The second
236 * level ARC itself may also contain buffers that are in any of
237 * the ARC_m* states - meaning that a buffer can exist in two
238 * places. The reason for the ARC_l2c_only state is to keep the
239 * buffer header in the hash table, so that reads that hit the
240 * second level ARC benefit from these fast lookups.
241 */
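/*
 * A rough lifecycle sketch implied by the above (hedged; the
 * authoritative transitions are made by arc_access() and the eviction
 * code later in this file):
 *
 *	arc_buf_alloc()		-> ARC_anon	(dirty, ref'd, no DVA)
 *	write assigns a DVA	-> ARC_mru	(hashed, cached)
 *	accessed again		-> ARC_mfu
 *	data evicted		-> ARC_mru_ghost / ARC_mfu_ghost
 *						(header only, no data)
 *	ghost hit		-> block re-read from disk; the header
 *				   returns to a "real" state and the
 *				   MRU/MFU balance is adapted
 */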
242
243 typedef struct arc_state {
244 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
245 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
246 uint64_t arcs_size; /* total amount of data in this state */
247 kmutex_t arcs_mtx;
248 } arc_state_t;
249
250 /* The 6 states: */
251 static arc_state_t ARC_anon;
252 static arc_state_t ARC_mru;
253 static arc_state_t ARC_mru_ghost;
254 static arc_state_t ARC_mfu;
255 static arc_state_t ARC_mfu_ghost;
256 static arc_state_t ARC_l2c_only;
257
258 typedef struct arc_stats {
259 kstat_named_t arcstat_hits;
260 kstat_named_t arcstat_misses;
261 kstat_named_t arcstat_demand_data_hits;
262 kstat_named_t arcstat_demand_data_misses;
263 kstat_named_t arcstat_demand_metadata_hits;
264 kstat_named_t arcstat_demand_metadata_misses;
265 kstat_named_t arcstat_prefetch_data_hits;
266 kstat_named_t arcstat_prefetch_data_misses;
267 kstat_named_t arcstat_prefetch_metadata_hits;
268 kstat_named_t arcstat_prefetch_metadata_misses;
269 kstat_named_t arcstat_mru_hits;
270 kstat_named_t arcstat_mru_ghost_hits;
271 kstat_named_t arcstat_mfu_hits;
272 kstat_named_t arcstat_mfu_ghost_hits;
273 kstat_named_t arcstat_deleted;
274 kstat_named_t arcstat_recycle_miss;
275 kstat_named_t arcstat_mutex_miss;
276 kstat_named_t arcstat_evict_skip;
277 kstat_named_t arcstat_evict_l2_cached;
278 kstat_named_t arcstat_evict_l2_eligible;
279 kstat_named_t arcstat_evict_l2_ineligible;
280 kstat_named_t arcstat_hash_elements;
281 kstat_named_t arcstat_hash_elements_max;
282 kstat_named_t arcstat_hash_collisions;
283 kstat_named_t arcstat_hash_chains;
284 kstat_named_t arcstat_hash_chain_max;
285 kstat_named_t arcstat_p;
286 kstat_named_t arcstat_c;
287 kstat_named_t arcstat_c_min;
288 kstat_named_t arcstat_c_max;
289 kstat_named_t arcstat_size;
290 kstat_named_t arcstat_hdr_size;
291 kstat_named_t arcstat_data_size;
292 kstat_named_t arcstat_other_size;
293 kstat_named_t arcstat_l2_hits;
294 kstat_named_t arcstat_l2_misses;
295 kstat_named_t arcstat_l2_feeds;
296 kstat_named_t arcstat_l2_rw_clash;
297 kstat_named_t arcstat_l2_read_bytes;
298 kstat_named_t arcstat_l2_write_bytes;
299 kstat_named_t arcstat_l2_writes_sent;
300 kstat_named_t arcstat_l2_writes_done;
301 kstat_named_t arcstat_l2_writes_error;
302 kstat_named_t arcstat_l2_writes_hdr_miss;
303 kstat_named_t arcstat_l2_evict_lock_retry;
304 kstat_named_t arcstat_l2_evict_reading;
305 kstat_named_t arcstat_l2_free_on_write;
306 kstat_named_t arcstat_l2_abort_lowmem;
307 kstat_named_t arcstat_l2_cksum_bad;
308 kstat_named_t arcstat_l2_io_error;
309 kstat_named_t arcstat_l2_size;
310 kstat_named_t arcstat_l2_hdr_size;
311 kstat_named_t arcstat_memory_throttle_count;
312 } arc_stats_t;
313
314 static arc_stats_t arc_stats = {
315 { "hits", KSTAT_DATA_UINT64 },
316 { "misses", KSTAT_DATA_UINT64 },
317 { "demand_data_hits", KSTAT_DATA_UINT64 },
318 { "demand_data_misses", KSTAT_DATA_UINT64 },
319 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
320 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
321 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
322 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
323 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
324 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
325 { "mru_hits", KSTAT_DATA_UINT64 },
326 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
327 { "mfu_hits", KSTAT_DATA_UINT64 },
328 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
329 { "deleted", KSTAT_DATA_UINT64 },
330 { "recycle_miss", KSTAT_DATA_UINT64 },
331 { "mutex_miss", KSTAT_DATA_UINT64 },
332 { "evict_skip", KSTAT_DATA_UINT64 },
333 { "evict_l2_cached", KSTAT_DATA_UINT64 },
334 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
335 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
336 { "hash_elements", KSTAT_DATA_UINT64 },
337 { "hash_elements_max", KSTAT_DATA_UINT64 },
338 { "hash_collisions", KSTAT_DATA_UINT64 },
339 { "hash_chains", KSTAT_DATA_UINT64 },
340 { "hash_chain_max", KSTAT_DATA_UINT64 },
341 { "p", KSTAT_DATA_UINT64 },
342 { "c", KSTAT_DATA_UINT64 },
343 { "c_min", KSTAT_DATA_UINT64 },
344 { "c_max", KSTAT_DATA_UINT64 },
345 { "size", KSTAT_DATA_UINT64 },
346 { "hdr_size", KSTAT_DATA_UINT64 },
347 { "data_size", KSTAT_DATA_UINT64 },
348 { "other_size", KSTAT_DATA_UINT64 },
349 { "l2_hits", KSTAT_DATA_UINT64 },
350 { "l2_misses", KSTAT_DATA_UINT64 },
351 { "l2_feeds", KSTAT_DATA_UINT64 },
352 { "l2_rw_clash", KSTAT_DATA_UINT64 },
353 { "l2_read_bytes", KSTAT_DATA_UINT64 },
354 { "l2_write_bytes", KSTAT_DATA_UINT64 },
355 { "l2_writes_sent", KSTAT_DATA_UINT64 },
356 { "l2_writes_done", KSTAT_DATA_UINT64 },
357 { "l2_writes_error", KSTAT_DATA_UINT64 },
358 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
359 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
360 { "l2_evict_reading", KSTAT_DATA_UINT64 },
361 { "l2_free_on_write", KSTAT_DATA_UINT64 },
362 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
363 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
364 { "l2_io_error", KSTAT_DATA_UINT64 },
365 { "l2_size", KSTAT_DATA_UINT64 },
366 { "l2_hdr_size", KSTAT_DATA_UINT64 },
367 { "memory_throttle_count", KSTAT_DATA_UINT64 }
368 };
369
370 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
371
372 #define ARCSTAT_INCR(stat, val) \
373 atomic_add_64(&arc_stats.stat.value.ui64, (val));
374
375 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
376 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
377
378 #define ARCSTAT_MAX(stat, val) { \
379 uint64_t m; \
380 while ((val) > (m = arc_stats.stat.value.ui64) && \
381 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
382 continue; \
383 }
384
385 #define ARCSTAT_MAXSTAT(stat) \
386 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
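/*
 * ARCSTAT_MAX() above is a lock-free "store maximum": it re-reads the
 * current value and retries the compare-and-swap until either the
 * candidate is no longer greater or the CAS installs it, so a larger
 * value can never be lost to a concurrent updater.
 */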
387
388 /*
389 * We define a macro to allow ARC hits/misses to be easily broken down by
390 * two separate conditions, giving a total of four different subtypes for
391 * each of hits and misses (so eight statistics total).
392 */
393 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
394 if (cond1) { \
395 if (cond2) { \
396 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
397 } else { \
398 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
399 } \
400 } else { \
401 if (cond2) { \
402 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
403 } else { \
404 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
405 } \
406 }
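/*
 * Example of the macro above (see arc_buf_add_ref() below for an
 * actual call site): classifying a hit as demand vs. prefetch and
 * data vs. metadata --
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits, depending on the two conditions.
 */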
407
408 kstat_t *arc_ksp;
409 static arc_state_t *arc_anon;
410 static arc_state_t *arc_mru;
411 static arc_state_t *arc_mru_ghost;
412 static arc_state_t *arc_mfu;
413 static arc_state_t *arc_mfu_ghost;
414 static arc_state_t *arc_l2c_only;
415
416 /*
417 * There are several ARC variables that are critical to export as kstats --
418 * but we don't want to have to grovel around in the kstat whenever we wish to
419 * manipulate them. For these variables, we therefore define them to be in
420 * terms of the statistic variable. This assures that we are not introducing
421 * the possibility of inconsistency by having shadow copies of the variables,
422 * while still allowing the code to be readable.
423 */
424 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
425 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
426 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
427 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
428 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
429
430 static int arc_no_grow; /* Don't try to grow cache size */
431 static uint64_t arc_tempreserve;
432 static uint64_t arc_loaned_bytes;
433 static uint64_t arc_meta_used;
434 static uint64_t arc_meta_limit;
435 static uint64_t arc_meta_max = 0;
436
437 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
438
439 typedef struct arc_callback arc_callback_t;
440
441 struct arc_callback {
442 void *acb_private;
443 arc_done_func_t *acb_done;
444 arc_buf_t *acb_buf;
445 zio_t *acb_zio_dummy;
446 arc_callback_t *acb_next;
447 };
448
449 typedef struct arc_write_callback arc_write_callback_t;
450
451 struct arc_write_callback {
452 void *awcb_private;
453 arc_done_func_t *awcb_ready;
454 arc_done_func_t *awcb_done;
455 arc_buf_t *awcb_buf;
456 };
457
458 struct arc_buf_hdr {
459 /* protected by hash lock */
460 dva_t b_dva;
461 uint64_t b_birth;
462 uint64_t b_cksum0;
463
464 kmutex_t b_freeze_lock;
465 zio_cksum_t *b_freeze_cksum;
466
467 arc_buf_hdr_t *b_hash_next;
468 arc_buf_t *b_buf;
469 uint32_t b_flags;
470 uint32_t b_datacnt;
471
472 arc_callback_t *b_acb;
473 kcondvar_t b_cv;
474
475 /* immutable */
476 arc_buf_contents_t b_type;
477 uint64_t b_size;
478 uint64_t b_spa;
479
480 /* protected by arc state mutex */
481 arc_state_t *b_state;
482 list_node_t b_arc_node;
483
484 /* updated atomically */
485 clock_t b_arc_access;
486
487 /* self protecting */
488 refcount_t b_refcnt;
489
490 l2arc_buf_hdr_t *b_l2hdr;
491 list_node_t b_l2node;
492 };
493
494 static arc_buf_t *arc_eviction_list;
495 static kmutex_t arc_eviction_mtx;
496 static arc_buf_hdr_t arc_eviction_hdr;
497 static void arc_get_data_buf(arc_buf_t *buf);
498 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
499 static int arc_evict_needed(arc_buf_contents_t type);
500 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
501
502 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
503
504 #define GHOST_STATE(state) \
505 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
506 (state) == arc_l2c_only)
507
508 /*
509 * Private ARC flags. These flags are private ARC only flags that will show up
510 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
511 * be passed in as arc_flags in things like arc_read. However, these flags
512 * should never be passed and should only be set by ARC code. When adding new
513 * public flags, make sure not to smash the private ones.
514 */
515
516 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
517 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
518 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
519 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
520 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
521 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
522 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
523 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
524 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
525 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
526
527 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
528 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
529 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
530 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
531 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
532 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
533 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
534 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
535 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
536 (hdr)->b_l2hdr != NULL)
537 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
538 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
539 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
540
541 /*
542 * Other sizes
543 */
544
545 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
546 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
547
548 /*
549 * Hash table routines
550 */
551
552 #define HT_LOCK_PAD 64
553
554 struct ht_lock {
555 kmutex_t ht_lock;
556 #ifdef _KERNEL
557 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
558 #endif
559 };
560
561 #define BUF_LOCKS 256
562 typedef struct buf_hash_table {
563 uint64_t ht_mask;
564 arc_buf_hdr_t **ht_table;
565 struct ht_lock ht_locks[BUF_LOCKS];
566 } buf_hash_table_t;
567
568 static buf_hash_table_t buf_hash_table;
569
570 #define BUF_HASH_INDEX(spa, dva, birth) \
571 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
572 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
573 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
574 #define HDR_LOCK(buf) \
575 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
576
577 uint64_t zfs_crc64_table[256];
578
579 /*
580 * Level 2 ARC
581 */
582
583 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
584 #define L2ARC_HEADROOM 2 /* num of writes */
585 #define L2ARC_FEED_SECS 1 /* caching interval secs */
586 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
587
588 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
589 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
590
591 /*
592 * L2ARC Performance Tunables
593 */
594 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
595 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
596 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
597 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
598 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
599 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
600 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
601 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
602
603 /*
604 * L2ARC Internals
605 */
606 typedef struct l2arc_dev {
607 vdev_t *l2ad_vdev; /* vdev */
608 spa_t *l2ad_spa; /* spa */
609 uint64_t l2ad_hand; /* next write location */
610 uint64_t l2ad_write; /* desired write size, bytes */
611 uint64_t l2ad_boost; /* warmup write boost, bytes */
612 uint64_t l2ad_start; /* first addr on device */
613 uint64_t l2ad_end; /* last addr on device */
614 uint64_t l2ad_evict; /* last addr eviction reached */
615 boolean_t l2ad_first; /* first sweep through */
616 boolean_t l2ad_writing; /* currently writing */
617 list_t *l2ad_buflist; /* buffer list */
618 list_node_t l2ad_node; /* device list node */
619 } l2arc_dev_t;
620
621 static list_t L2ARC_dev_list; /* device list */
622 static list_t *l2arc_dev_list; /* device list pointer */
623 static kmutex_t l2arc_dev_mtx; /* device list mutex */
624 static l2arc_dev_t *l2arc_dev_last; /* last device used */
625 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
626 static list_t L2ARC_free_on_write; /* free after write buf list */
627 static list_t *l2arc_free_on_write; /* free after write list ptr */
628 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
629 static uint64_t l2arc_ndev; /* number of devices */
630
631 typedef struct l2arc_read_callback {
632 arc_buf_t *l2rcb_buf; /* read buffer */
633 spa_t *l2rcb_spa; /* spa */
634 blkptr_t l2rcb_bp; /* original blkptr */
635 zbookmark_t l2rcb_zb; /* original bookmark */
636 int l2rcb_flags; /* original flags */
637 } l2arc_read_callback_t;
638
639 typedef struct l2arc_write_callback {
640 l2arc_dev_t *l2wcb_dev; /* device info */
641 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
642 } l2arc_write_callback_t;
643
644 struct l2arc_buf_hdr {
645 /* protected by arc_buf_hdr mutex */
646 l2arc_dev_t *b_dev; /* L2ARC device */
647 uint64_t b_daddr; /* disk address, offset byte */
648 };
649
650 typedef struct l2arc_data_free {
651 /* protected by l2arc_free_on_write_mtx */
652 void *l2df_data;
653 size_t l2df_size;
654 void (*l2df_func)(void *, size_t);
655 list_node_t l2df_list_node;
656 } l2arc_data_free_t;
657
658 static kmutex_t l2arc_feed_thr_lock;
659 static kcondvar_t l2arc_feed_thr_cv;
660 static uint8_t l2arc_thread_exit;
661
662 static void l2arc_read_done(zio_t *zio);
663 static void l2arc_hdr_stat_add(void);
664 static void l2arc_hdr_stat_remove(void);
665
666 static uint64_t
667 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
668 {
669 uint8_t *vdva = (uint8_t *)dva;
670 uint64_t crc = -1ULL;
671 int i;
672
673 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
674
675 for (i = 0; i < sizeof (dva_t); i++)
676 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
677
678 crc ^= (spa>>8) ^ birth;
679
680 return (crc);
681 }
682
683 #define BUF_EMPTY(buf) \
684 ((buf)->b_dva.dva_word[0] == 0 && \
685 (buf)->b_dva.dva_word[1] == 0 && \
686 (buf)->b_birth == 0)
687
688 #define BUF_EQUAL(spa, dva, birth, buf) \
689 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
690 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
691 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
692
693 static arc_buf_hdr_t *
694 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
695 {
696 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
697 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
698 arc_buf_hdr_t *buf;
699
700 mutex_enter(hash_lock);
701 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
702 buf = buf->b_hash_next) {
703 if (BUF_EQUAL(spa, dva, birth, buf)) {
704 *lockp = hash_lock;
705 return (buf);
706 }
707 }
708 mutex_exit(hash_lock);
709 *lockp = NULL;
710 return (NULL);
711 }
712
713 /*
714 * Insert an entry into the hash table. If there is already an element
715 * equal to elem in the hash table, then the already existing element
716 * will be returned and the new element will not be inserted.
717 * Otherwise returns NULL.
718 */
719 static arc_buf_hdr_t *
720 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
721 {
722 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
723 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
724 arc_buf_hdr_t *fbuf;
725 uint32_t i;
726
727 ASSERT(!HDR_IN_HASH_TABLE(buf));
728 *lockp = hash_lock;
729 mutex_enter(hash_lock);
730 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
731 fbuf = fbuf->b_hash_next, i++) {
732 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
733 return (fbuf);
734 }
735
736 buf->b_hash_next = buf_hash_table.ht_table[idx];
737 buf_hash_table.ht_table[idx] = buf;
738 buf->b_flags |= ARC_IN_HASH_TABLE;
739
740 /* collect some hash table performance data */
741 if (i > 0) {
742 ARCSTAT_BUMP(arcstat_hash_collisions);
743 if (i == 1)
744 ARCSTAT_BUMP(arcstat_hash_chains);
745
746 ARCSTAT_MAX(arcstat_hash_chain_max, i);
747 }
748
749 ARCSTAT_BUMP(arcstat_hash_elements);
750 ARCSTAT_MAXSTAT(arcstat_hash_elements);
751
752 return (NULL);
753 }
754
755 static void
756 buf_hash_remove(arc_buf_hdr_t *buf)
757 {
758 arc_buf_hdr_t *fbuf, **bufp;
759 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
760
761 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
762 ASSERT(HDR_IN_HASH_TABLE(buf));
763
764 bufp = &buf_hash_table.ht_table[idx];
765 while ((fbuf = *bufp) != buf) {
766 ASSERT(fbuf != NULL);
767 bufp = &fbuf->b_hash_next;
768 }
769 *bufp = buf->b_hash_next;
770 buf->b_hash_next = NULL;
771 buf->b_flags &= ~ARC_IN_HASH_TABLE;
772
773 /* collect some hash table performance data */
774 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
775
776 if (buf_hash_table.ht_table[idx] &&
777 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
778 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
779 }
780
781 /*
782 * Global data structures and functions for the buf kmem cache.
783 */
784 static kmem_cache_t *hdr_cache;
785 static kmem_cache_t *buf_cache;
786
787 static void
788 buf_fini(void)
789 {
790 int i;
791
792 kmem_free(buf_hash_table.ht_table,
793 (buf_hash_table.ht_mask + 1) * sizeof (void *));
794 for (i = 0; i < BUF_LOCKS; i++)
795 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
796 kmem_cache_destroy(hdr_cache);
797 kmem_cache_destroy(buf_cache);
798 }
799
800 /*
801 * Constructor callback - called when the cache is empty
802 * and a new buf is requested.
803 */
804 /* ARGSUSED */
805 static int
806 hdr_cons(void *vbuf, void *unused, int kmflag)
807 {
808 arc_buf_hdr_t *buf = unused;
809
810 bzero(buf, sizeof (arc_buf_hdr_t));
811 refcount_create(&buf->b_refcnt);
812 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
813 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
814 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
815
816 return (0);
817 }
818
819 /* ARGSUSED */
820 static int
821 buf_cons(void *vbuf, void *unused, int kmflag)
822 {
823 arc_buf_t *buf = unused;
824
825 bzero(buf, sizeof (arc_buf_t));
826 rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
827 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
828
829 return (0);
830 }
831
832 /*
833 * Destructor callback - called when a cached buf is
834 * no longer required.
835 */
836 /* ARGSUSED */
837 static void
838 hdr_dest(void *vbuf, void *unused)
839 {
840 arc_buf_hdr_t *buf = unused;
841
842 ASSERT(BUF_EMPTY(buf));
843 refcount_destroy(&buf->b_refcnt);
844 cv_destroy(&buf->b_cv);
845 mutex_destroy(&buf->b_freeze_lock);
846 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
847 }
848
849 /* ARGSUSED */
850 static void
851 buf_dest(void *vbuf, void *unused)
852 {
853 arc_buf_t *buf = unused;
854
855 rw_destroy(&buf->b_lock);
856 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
857 }
858
859 /*
860 * Reclaim callback -- invoked when memory is low.
861 */
862 /* ARGSUSED */
863 static void
864 hdr_recl(void *unused)
865 {
866 dprintf("hdr_recl called\n");
867 /*
868 * umem calls the reclaim func when we destroy the buf cache,
869 * which is after we do arc_fini().
870 */
871 if (!arc_dead)
872 cv_signal(&arc_reclaim_thr_cv);
873 }
874
875 static void
876 buf_init(void)
877 {
878 uint64_t *ct;
879 uint64_t hsize = 1ULL << 12;
880 int i, j;
881
882 /*
883 * The hash table is big enough to fill all of physical memory
884 * with an average 64K block size. The table will take up
885 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
886 */
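	/*
	 * Worked example (illustrative): with 8GB of physical memory,
	 * 8GB / 64K = 131072 entries are needed; the loop below grows
	 * hsize to 2^17, so the table consumes 131072 * 8 bytes = 1MB,
	 * matching the 128KB-per-GB estimate above.
	 */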
887 while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
888 hsize <<= 1;
889 retry:
890 buf_hash_table.ht_mask = hsize - 1;
891 buf_hash_table.ht_table =
892 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
893 if (buf_hash_table.ht_table == NULL) {
894 ASSERT(hsize > (1ULL << 8));
895 hsize >>= 1;
896 goto retry;
897 }
898
899 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
900 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
901 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
902 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
903
904 for (i = 0; i < 256; i++)
905 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
906 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
907
908 for (i = 0; i < BUF_LOCKS; i++) {
909 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
910 NULL, MUTEX_DEFAULT, NULL);
911 }
912 }
913
914 #define ARC_MINTIME (hz>>4) /* 62 ms */
915
916 static void
917 arc_cksum_verify(arc_buf_t *buf)
918 {
919 zio_cksum_t zc;
920
921 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
922 return;
923
924 mutex_enter(&buf->b_hdr->b_freeze_lock);
925 if (buf->b_hdr->b_freeze_cksum == NULL ||
926 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
927 mutex_exit(&buf->b_hdr->b_freeze_lock);
928 return;
929 }
930 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
931 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
932 panic("buffer modified while frozen!");
933 mutex_exit(&buf->b_hdr->b_freeze_lock);
934 }
935
936 static int
937 arc_cksum_equal(arc_buf_t *buf)
938 {
939 zio_cksum_t zc;
940 int equal;
941
942 mutex_enter(&buf->b_hdr->b_freeze_lock);
943 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
944 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
945 mutex_exit(&buf->b_hdr->b_freeze_lock);
946
947 return (equal);
948 }
949
950 static void
951 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
952 {
953 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
954 return;
955
956 mutex_enter(&buf->b_hdr->b_freeze_lock);
957 if (buf->b_hdr->b_freeze_cksum != NULL) {
958 mutex_exit(&buf->b_hdr->b_freeze_lock);
959 return;
960 }
961 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
962 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
963 buf->b_hdr->b_freeze_cksum);
964 mutex_exit(&buf->b_hdr->b_freeze_lock);
965 }
966
967 void
968 arc_buf_thaw(arc_buf_t *buf)
969 {
970 if (zfs_flags & ZFS_DEBUG_MODIFY) {
971 if (buf->b_hdr->b_state != arc_anon)
972 panic("modifying non-anon buffer!");
973 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
974 panic("modifying buffer while i/o in progress!");
975 arc_cksum_verify(buf);
976 }
977
978 mutex_enter(&buf->b_hdr->b_freeze_lock);
979 if (buf->b_hdr->b_freeze_cksum != NULL) {
980 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
981 buf->b_hdr->b_freeze_cksum = NULL;
982 }
983 mutex_exit(&buf->b_hdr->b_freeze_lock);
984 }
985
986 void
987 arc_buf_freeze(arc_buf_t *buf)
988 {
989 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
990 return;
991
992 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
993 buf->b_hdr->b_state == arc_anon);
994 arc_cksum_compute(buf, B_FALSE);
995 }
996
997 static void
998 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
999 {
1000 ASSERT(MUTEX_HELD(hash_lock));
1001
1002 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1003 (ab->b_state != arc_anon)) {
1004 uint64_t delta = ab->b_size * ab->b_datacnt;
1005 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1006 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1007
1008 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1009 mutex_enter(&ab->b_state->arcs_mtx);
1010 ASSERT(list_link_active(&ab->b_arc_node));
1011 list_remove(list, ab);
1012 if (GHOST_STATE(ab->b_state)) {
1013 ASSERT3U(ab->b_datacnt, ==, 0);
1014 ASSERT3P(ab->b_buf, ==, NULL);
1015 delta = ab->b_size;
1016 }
1017 ASSERT(delta > 0);
1018 ASSERT3U(*size, >=, delta);
1019 atomic_add_64(size, -delta);
1020 mutex_exit(&ab->b_state->arcs_mtx);
1021 /* remove the prefetch flag if we get a reference */
1022 if (ab->b_flags & ARC_PREFETCH)
1023 ab->b_flags &= ~ARC_PREFETCH;
1024 }
1025 }
1026
1027 static int
1028 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1029 {
1030 int cnt;
1031 arc_state_t *state = ab->b_state;
1032
1033 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1034 ASSERT(!GHOST_STATE(state));
1035
1036 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1037 (state != arc_anon)) {
1038 uint64_t *size = &state->arcs_lsize[ab->b_type];
1039
1040 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1041 mutex_enter(&state->arcs_mtx);
1042 ASSERT(!list_link_active(&ab->b_arc_node));
1043 list_insert_head(&state->arcs_list[ab->b_type], ab);
1044 ASSERT(ab->b_datacnt > 0);
1045 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1046 mutex_exit(&state->arcs_mtx);
1047 }
1048 return (cnt);
1049 }
1050
1051 /*
1052 * Move the supplied buffer to the indicated state. The mutex
1053 * for the buffer must be held by the caller.
1054 */
1055 static void
1056 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1057 {
1058 arc_state_t *old_state = ab->b_state;
1059 int64_t refcnt = refcount_count(&ab->b_refcnt);
1060 uint64_t from_delta, to_delta;
1061
1062 ASSERT(MUTEX_HELD(hash_lock));
1063 ASSERT(new_state != old_state);
1064 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1065 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1066 ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon);
1067 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1068
1069 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1070
1071 /*
1072 * If this buffer is evictable, transfer it from the
1073 * old state list to the new state list.
1074 */
1075 if (refcnt == 0) {
1076 if (old_state != arc_anon) {
1077 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1078 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1079
1080 if (use_mutex)
1081 mutex_enter(&old_state->arcs_mtx);
1082
1083 ASSERT(list_link_active(&ab->b_arc_node));
1084 list_remove(&old_state->arcs_list[ab->b_type], ab);
1085
1086 /*
1087 * If prefetching out of the ghost cache,
1088 * we will have a non-null datacnt.
1089 */
1090 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1091 /* ghost elements have a ghost size */
1092 ASSERT(ab->b_buf == NULL);
1093 from_delta = ab->b_size;
1094 }
1095 ASSERT3U(*size, >=, from_delta);
1096 atomic_add_64(size, -from_delta);
1097
1098 if (use_mutex)
1099 mutex_exit(&old_state->arcs_mtx);
1100 }
1101 if (new_state != arc_anon) {
1102 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1103 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1104
1105 if (use_mutex)
1106 mutex_enter(&new_state->arcs_mtx);
1107
1108 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1109
1110 /* ghost elements have a ghost size */
1111 if (GHOST_STATE(new_state)) {
1112 ASSERT(ab->b_datacnt == 0);
1113 ASSERT(ab->b_buf == NULL);
1114 to_delta = ab->b_size;
1115 }
1116 atomic_add_64(size, to_delta);
1117
1118 if (use_mutex)
1119 mutex_exit(&new_state->arcs_mtx);
1120 }
1121 }
1122
1123 ASSERT(!BUF_EMPTY(ab));
1124 if (new_state == arc_anon) {
1125 buf_hash_remove(ab);
1126 }
1127
1128 /* adjust state sizes */
1129 if (to_delta)
1130 atomic_add_64(&new_state->arcs_size, to_delta);
1131 if (from_delta) {
1132 ASSERT3U(old_state->arcs_size, >=, from_delta);
1133 atomic_add_64(&old_state->arcs_size, -from_delta);
1134 }
1135 ab->b_state = new_state;
1136
1137 /* adjust l2arc hdr stats */
1138 if (new_state == arc_l2c_only)
1139 l2arc_hdr_stat_add();
1140 else if (old_state == arc_l2c_only)
1141 l2arc_hdr_stat_remove();
1142 }
1143
1144 void
1145 arc_space_consume(uint64_t space, arc_space_type_t type)
1146 {
1147 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1148
1149 switch (type) {
1150 case ARC_SPACE_DATA:
1151 ARCSTAT_INCR(arcstat_data_size, space);
1152 break;
1153 case ARC_SPACE_OTHER:
1154 ARCSTAT_INCR(arcstat_other_size, space);
1155 break;
1156 case ARC_SPACE_HDRS:
1157 ARCSTAT_INCR(arcstat_hdr_size, space);
1158 break;
1159 case ARC_SPACE_L2HDRS:
1160 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1161 break;
1162 }
1163
1164 atomic_add_64(&arc_meta_used, space);
1165 atomic_add_64(&arc_size, space);
1166 }
1167
1168 void
1169 arc_space_return(uint64_t space, arc_space_type_t type)
1170 {
1171 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1172
1173 switch (type) {
1174 case ARC_SPACE_DATA:
1175 ARCSTAT_INCR(arcstat_data_size, -space);
1176 break;
1177 case ARC_SPACE_OTHER:
1178 ARCSTAT_INCR(arcstat_other_size, -space);
1179 break;
1180 case ARC_SPACE_HDRS:
1181 ARCSTAT_INCR(arcstat_hdr_size, -space);
1182 break;
1183 case ARC_SPACE_L2HDRS:
1184 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1185 break;
1186 }
1187
1188 ASSERT(arc_meta_used >= space);
1189 if (arc_meta_max < arc_meta_used)
1190 arc_meta_max = arc_meta_used;
1191 atomic_add_64(&arc_meta_used, -space);
1192 ASSERT(arc_size >= space);
1193 atomic_add_64(&arc_size, -space);
1194 }
1195
1196 void *
1197 arc_data_buf_alloc(uint64_t size)
1198 {
1199 if (arc_evict_needed(ARC_BUFC_DATA))
1200 cv_signal(&arc_reclaim_thr_cv);
1201 atomic_add_64(&arc_size, size);
1202 return (zio_data_buf_alloc(size));
1203 }
1204
1205 void
1206 arc_data_buf_free(void *buf, uint64_t size)
1207 {
1208 zio_data_buf_free(buf, size);
1209 ASSERT(arc_size >= size);
1210 atomic_add_64(&arc_size, -size);
1211 }
1212
1213 arc_buf_t *
1214 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1215 {
1216 arc_buf_hdr_t *hdr;
1217 arc_buf_t *buf;
1218
1219 ASSERT3U(size, >, 0);
1220 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1221 ASSERT(BUF_EMPTY(hdr));
1222 hdr->b_size = size;
1223 hdr->b_type = type;
1224 hdr->b_spa = spa_guid(spa);
1225 hdr->b_state = arc_anon;
1226 hdr->b_arc_access = 0;
1227 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1228 buf->b_hdr = hdr;
1229 buf->b_data = NULL;
1230 buf->b_efunc = NULL;
1231 buf->b_private = NULL;
1232 buf->b_next = NULL;
1233 hdr->b_buf = buf;
1234 arc_get_data_buf(buf);
1235 hdr->b_datacnt = 1;
1236 hdr->b_flags = 0;
1237 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1238 (void) refcount_add(&hdr->b_refcnt, tag);
1239
1240 return (buf);
1241 }
1242
1243 static char *arc_onloan_tag = "onloan";
1244
1245 /*
1246 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1247 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1248 * buffers must be returned to the arc before they can be used by the DMU or
1249 * freed.
1250 */
1251 arc_buf_t *
1252 arc_loan_buf(spa_t *spa, int size)
1253 {
1254 arc_buf_t *buf;
1255
1256 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1257
1258 atomic_add_64(&arc_loaned_bytes, size);
1259 return (buf);
1260 }
1261
1262 /*
1263 * Return a loaned arc buffer to the arc.
1264 */
1265 void
1266 arc_return_buf(arc_buf_t *buf, void *tag)
1267 {
1268 arc_buf_hdr_t *hdr = buf->b_hdr;
1269
1270 ASSERT(buf->b_data != NULL);
1271 (void) refcount_add(&hdr->b_refcnt, tag);
1272 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1273
1274 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1275 }
1276
1277 /* Detach an arc_buf from a dbuf (tag) */
1278 void
1279 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1280 {
1281 arc_buf_hdr_t *hdr;
1282
1283 rw_enter(&buf->b_lock, RW_WRITER);
1284 ASSERT(buf->b_data != NULL);
1285 hdr = buf->b_hdr;
1286 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1287 (void) refcount_remove(&hdr->b_refcnt, tag);
1288 buf->b_efunc = NULL;
1289 buf->b_private = NULL;
1290
1291 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1292 rw_exit(&buf->b_lock);
1293 }
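/*
 * A hedged sketch of the loan protocol implemented by the functions
 * above (the tag and the fill step are hypothetical placeholders):
 *
 *	abuf = arc_loan_buf(spa, size);		(arc_loaned_bytes += size)
 *	... fill abuf->b_data with dirty data ...
 *	arc_return_buf(abuf, tag);		(ownership returns to the
 *						 ARC, arc_loaned_bytes -= size)
 *
 * arc_loan_inuse_buf() covers the opposite direction: a buffer already
 * attached to a dbuf (tag) is detached and re-tagged as on loan.
 */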
1294
1295 static arc_buf_t *
1296 arc_buf_clone(arc_buf_t *from)
1297 {
1298 arc_buf_t *buf;
1299 arc_buf_hdr_t *hdr = from->b_hdr;
1300 uint64_t size = hdr->b_size;
1301
1302 ASSERT(hdr->b_state != arc_anon);
1303
1304 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1305 buf->b_hdr = hdr;
1306 buf->b_data = NULL;
1307 buf->b_efunc = NULL;
1308 buf->b_private = NULL;
1309 buf->b_next = hdr->b_buf;
1310 hdr->b_buf = buf;
1311 arc_get_data_buf(buf);
1312 bcopy(from->b_data, buf->b_data, size);
1313 hdr->b_datacnt += 1;
1314 return (buf);
1315 }
1316
1317 void
1318 arc_buf_add_ref(arc_buf_t *buf, void *tag)
1319 {
1320 arc_buf_hdr_t *hdr;
1321 kmutex_t *hash_lock;
1322
1323 /*
1324 * Check to see if this buffer is evicted. Callers
1325 * must verify b_data != NULL to know if the add_ref
1326 * was successful.
1327 */
1328 rw_enter(&buf->b_lock, RW_READER);
1329 if (buf->b_data == NULL) {
1330 rw_exit(&buf->b_lock);
1331 return;
1332 }
1333 hdr = buf->b_hdr;
1334 ASSERT(hdr != NULL);
1335 hash_lock = HDR_LOCK(hdr);
1336 mutex_enter(hash_lock);
1337 rw_exit(&buf->b_lock);
1338
1339 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1340 add_reference(hdr, hash_lock, tag);
1341 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1342 arc_access(hdr, hash_lock);
1343 mutex_exit(hash_lock);
1344 ARCSTAT_BUMP(arcstat_hits);
1345 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1346 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1347 data, metadata, hits);
1348 }
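/*
 * Hypothetical caller-side pattern for the "evicted?" check described
 * at the top of arc_buf_add_ref() (not an actual call site in this
 * file):
 *
 *	arc_buf_add_ref(buf, tag);
 *	if (buf->b_data == NULL) {
 *		(the buffer was evicted before the reference could be
 *		 taken; the caller must fall back to re-reading the
 *		 block, e.g. via arc_read())
 *	}
 */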
1349
1350 /*
1351 * Free the arc data buffer. If it is an l2arc write in progress,
1352 * the buffer is placed on l2arc_free_on_write to be freed later.
1353 */
1354 static void
1355 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1356 void *data, size_t size)
1357 {
1358 if (HDR_L2_WRITING(hdr)) {
1359 l2arc_data_free_t *df;
1360 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1361 df->l2df_data = data;
1362 df->l2df_size = size;
1363 df->l2df_func = free_func;
1364 mutex_enter(&l2arc_free_on_write_mtx);
1365 list_insert_head(l2arc_free_on_write, df);
1366 mutex_exit(&l2arc_free_on_write_mtx);
1367 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1368 } else {
1369 free_func(data, size);
1370 }
1371 }
1372
1373 static void
1374 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1375 {
1376 arc_buf_t **bufp;
1377
1378 /* free up data associated with the buf */
1379 if (buf->b_data) {
1380 arc_state_t *state = buf->b_hdr->b_state;
1381 uint64_t size = buf->b_hdr->b_size;
1382 arc_buf_contents_t type = buf->b_hdr->b_type;
1383
1384 arc_cksum_verify(buf);
1385
1386 if (!recycle) {
1387 if (type == ARC_BUFC_METADATA) {
1388 arc_buf_data_free(buf->b_hdr, zio_buf_free,
1389 buf->b_data, size);
1390 arc_space_return(size, ARC_SPACE_DATA);
1391 } else {
1392 ASSERT(type == ARC_BUFC_DATA);
1393 arc_buf_data_free(buf->b_hdr,
1394 zio_data_buf_free, buf->b_data, size);
1395 ARCSTAT_INCR(arcstat_data_size, -size);
1396 atomic_add_64(&arc_size, -size);
1397 }
1398 }
1399 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1400 uint64_t *cnt = &state->arcs_lsize[type];
1401
1402 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1403 ASSERT(state != arc_anon);
1404
1405 ASSERT3U(*cnt, >=, size);
1406 atomic_add_64(cnt, -size);
1407 }
1408 ASSERT3U(state->arcs_size, >=, size);
1409 atomic_add_64(&state->arcs_size, -size);
1410 buf->b_data = NULL;
1411 ASSERT(buf->b_hdr->b_datacnt > 0);
1412 buf->b_hdr->b_datacnt -= 1;
1413 }
1414
1415 /* only remove the buf if requested */
1416 if (!all)
1417 return;
1418
1419 /* remove the buf from the hdr list */
1420 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1421 continue;
1422 *bufp = buf->b_next;
1423
1424 ASSERT(buf->b_efunc == NULL);
1425
1426 /* clean up the buf */
1427 buf->b_hdr = NULL;
1428 kmem_cache_free(buf_cache, buf);
1429 }
1430
1431 static void
1432 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1433 {
1434 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1435 ASSERT3P(hdr->b_state, ==, arc_anon);
1436 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1437 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1438
1439 if (l2hdr != NULL) {
1440 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1441 /*
1442 * To prevent arc_free() and l2arc_evict() from
1443 * attempting to free the same buffer at the same time,
1444 * a FREE_IN_PROGRESS flag is given to arc_free() to
1445 * give it priority. l2arc_evict() can't destroy this
1446 * header while we are waiting on l2arc_buflist_mtx.
1447 *
1448 * The hdr may be removed from l2ad_buflist before we
1449 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1450 */
1451 if (!buflist_held) {
1452 mutex_enter(&l2arc_buflist_mtx);
1453 l2hdr = hdr->b_l2hdr;
1454 }
1455
1456 if (l2hdr != NULL) {
1457 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1458 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1459 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1460 if (hdr->b_state == arc_l2c_only)
1461 l2arc_hdr_stat_remove();
1462 hdr->b_l2hdr = NULL;
1463 }
1464
1465 if (!buflist_held)
1466 mutex_exit(&l2arc_buflist_mtx);
1467 }
1468
1469 if (!BUF_EMPTY(hdr)) {
1470 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1471 bzero(&hdr->b_dva, sizeof (dva_t));
1472 hdr->b_birth = 0;
1473 hdr->b_cksum0 = 0;
1474 }
1475 while (hdr->b_buf) {
1476 arc_buf_t *buf = hdr->b_buf;
1477
1478 if (buf->b_efunc) {
1479 mutex_enter(&arc_eviction_mtx);
1480 rw_enter(&buf->b_lock, RW_WRITER);
1481 ASSERT(buf->b_hdr != NULL);
1482 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1483 hdr->b_buf = buf->b_next;
1484 buf->b_hdr = &arc_eviction_hdr;
1485 buf->b_next = arc_eviction_list;
1486 arc_eviction_list = buf;
1487 rw_exit(&buf->b_lock);
1488 mutex_exit(&arc_eviction_mtx);
1489 } else {
1490 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1491 }
1492 }
1493 if (hdr->b_freeze_cksum != NULL) {
1494 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1495 hdr->b_freeze_cksum = NULL;
1496 }
1497
1498 ASSERT(!list_link_active(&hdr->b_arc_node));
1499 ASSERT3P(hdr->b_hash_next, ==, NULL);
1500 ASSERT3P(hdr->b_acb, ==, NULL);
1501 kmem_cache_free(hdr_cache, hdr);
1502 }
1503
1504 void
1505 arc_buf_free(arc_buf_t *buf, void *tag)
1506 {
1507 arc_buf_hdr_t *hdr = buf->b_hdr;
1508 int hashed = hdr->b_state != arc_anon;
1509
1510 ASSERT(buf->b_efunc == NULL);
1511 ASSERT(buf->b_data != NULL);
1512
1513 if (hashed) {
1514 kmutex_t *hash_lock = HDR_LOCK(hdr);
1515
1516 mutex_enter(hash_lock);
1517 (void) remove_reference(hdr, hash_lock, tag);
1518 if (hdr->b_datacnt > 1) {
1519 arc_buf_destroy(buf, FALSE, TRUE);
1520 } else {
1521 ASSERT(buf == hdr->b_buf);
1522 ASSERT(buf->b_efunc == NULL);
1523 hdr->b_flags |= ARC_BUF_AVAILABLE;
1524 }
1525 mutex_exit(hash_lock);
1526 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1527 int destroy_hdr;
1528 /*
1529 * We are in the middle of an async write. Don't destroy
1530 * this buffer unless the write completes before we finish
1531 * decrementing the reference count.
1532 */
1533 mutex_enter(&arc_eviction_mtx);
1534 (void) remove_reference(hdr, NULL, tag);
1535 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1536 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1537 mutex_exit(&arc_eviction_mtx);
1538 if (destroy_hdr)
1539 arc_hdr_destroy(hdr);
1540 } else {
1541 if (remove_reference(hdr, NULL, tag) > 0) {
1542 ASSERT(HDR_IO_ERROR(hdr));
1543 arc_buf_destroy(buf, FALSE, TRUE);
1544 } else {
1545 arc_hdr_destroy(hdr);
1546 }
1547 }
1548 }
1549
1550 int
1551 arc_buf_remove_ref(arc_buf_t *buf, void *tag)
1552 {
1553 arc_buf_hdr_t *hdr = buf->b_hdr;
1554 kmutex_t *hash_lock = HDR_LOCK(hdr);
1555 int no_callback = (buf->b_efunc == NULL);
1556
1557 if (hdr->b_state == arc_anon) {
1558 ASSERT(hdr->b_datacnt == 1);
1559 arc_buf_free(buf, tag);
1560 return (no_callback);
1561 }
1562
1563 mutex_enter(hash_lock);
1564 ASSERT(hdr->b_state != arc_anon);
1565 ASSERT(buf->b_data != NULL);
1566
1567 (void) remove_reference(hdr, hash_lock, tag);
1568 if (hdr->b_datacnt > 1) {
1569 if (no_callback)
1570 arc_buf_destroy(buf, FALSE, TRUE);
1571 } else if (no_callback) {
1572 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1573 ASSERT(buf->b_efunc == NULL);
1574 hdr->b_flags |= ARC_BUF_AVAILABLE;
1575 }
1576 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1577 refcount_is_zero(&hdr->b_refcnt));
1578 mutex_exit(hash_lock);
1579 return (no_callback);
1580 }
1581
1582 int
1583 arc_buf_size(arc_buf_t *buf)
1584 {
1585 return (buf->b_hdr->b_size);
1586 }
1587
1588 /*
1589 * Evict buffers from list until we've removed the specified number of
1590 * bytes. Move the removed buffers to the appropriate evict state.
1591 * If the recycle flag is set, then attempt to "recycle" a buffer:
1592 * - look for a buffer to evict that is `bytes' long.
1593 * - return the data block from this buffer rather than freeing it.
1594 * This flag is used by callers that are trying to make space for a
1595 * new buffer in a full arc cache.
1596 *
1597 * This function makes a "best effort". It skips over any buffers
1598 * it can't get a hash_lock on, and so may not catch all candidates.
1599 * It may also return without evicting as much space as requested.
1600 */
1601 static void *
1602 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1603 arc_buf_contents_t type)
1604 {
1605 arc_state_t *evicted_state;
1606 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1607 arc_buf_hdr_t *ab, *ab_prev = NULL;
1608 list_t *list = &state->arcs_list[type];
1609 kmutex_t *hash_lock;
1610 boolean_t have_lock;
1611 void *stolen = NULL;
1612
1613 ASSERT(state == arc_mru || state == arc_mfu);
1614
1615 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1616
1617 mutex_enter(&state->arcs_mtx);
1618 mutex_enter(&evicted_state->arcs_mtx);
1619
1620 for (ab = list_tail(list); ab; ab = ab_prev) {
1621 ab_prev = list_prev(list, ab);
1622 /* prefetch buffers have a minimum lifespan */
1623 if (HDR_IO_IN_PROGRESS(ab) ||
1624 (spa && ab->b_spa != spa) ||
1625 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1626 ddi_get_lbolt() - ab->b_arc_access <
1627 arc_min_prefetch_lifespan)) {
1628 skipped++;
1629 continue;
1630 }
1631 /* "lookahead" for better eviction candidate */
1632 if (recycle && ab->b_size != bytes &&
1633 ab_prev && ab_prev->b_size == bytes)
1634 continue;
1635 hash_lock = HDR_LOCK(ab);
1636 have_lock = MUTEX_HELD(hash_lock);
1637 if (have_lock || mutex_tryenter(hash_lock)) {
1638 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1639 ASSERT(ab->b_datacnt > 0);
1640 while (ab->b_buf) {
1641 arc_buf_t *buf = ab->b_buf;
1642 if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
1643 missed += 1;
1644 break;
1645 }
1646 if (buf->b_data) {
1647 bytes_evicted += ab->b_size;
1648 if (recycle && ab->b_type == type &&
1649 ab->b_size == bytes &&
1650 !HDR_L2_WRITING(ab)) {
1651 stolen = buf->b_data;
1652 recycle = FALSE;
1653 }
1654 }
1655 if (buf->b_efunc) {
1656 mutex_enter(&arc_eviction_mtx);
1657 arc_buf_destroy(buf,
1658 buf->b_data == stolen, FALSE);
1659 ab->b_buf = buf->b_next;
1660 buf->b_hdr = &arc_eviction_hdr;
1661 buf->b_next = arc_eviction_list;
1662 arc_eviction_list = buf;
1663 mutex_exit(&arc_eviction_mtx);
1664 rw_exit(&buf->b_lock);
1665 } else {
1666 rw_exit(&buf->b_lock);
1667 arc_buf_destroy(buf,
1668 buf->b_data == stolen, TRUE);
1669 }
1670 }
1671
1672 if (ab->b_l2hdr) {
1673 ARCSTAT_INCR(arcstat_evict_l2_cached,
1674 ab->b_size);
1675 } else {
1676 if (l2arc_write_eligible(ab->b_spa, ab)) {
1677 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1678 ab->b_size);
1679 } else {
1680 ARCSTAT_INCR(
1681 arcstat_evict_l2_ineligible,
1682 ab->b_size);
1683 }
1684 }
1685
1686 if (ab->b_datacnt == 0) {
1687 arc_change_state(evicted_state, ab, hash_lock);
1688 ASSERT(HDR_IN_HASH_TABLE(ab));
1689 ab->b_flags |= ARC_IN_HASH_TABLE;
1690 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1691 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1692 }
1693 if (!have_lock)
1694 mutex_exit(hash_lock);
1695 if (bytes >= 0 && bytes_evicted >= bytes)
1696 break;
1697 } else {
1698 missed += 1;
1699 }
1700 }
1701
1702 mutex_exit(&evicted_state->arcs_mtx);
1703 mutex_exit(&state->arcs_mtx);
1704
1705 if (bytes_evicted < bytes)
1706 dprintf("only evicted %lld bytes from %x",
1707 (longlong_t)bytes_evicted, state);
1708
1709 if (skipped)
1710 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1711
1712 if (missed)
1713 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1714
1715 /*
1716 * We have just evicted some data into the ghost state; make
1717 * sure we also adjust the ghost state size if necessary.
1718 */
1719 if (arc_no_grow &&
1720 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1721 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1722 arc_mru_ghost->arcs_size - arc_c;
1723
1724 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1725 int64_t todelete =
1726 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1727 arc_evict_ghost(arc_mru_ghost, 0, todelete);
1728 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1729 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1730 arc_mru_ghost->arcs_size +
1731 arc_mfu_ghost->arcs_size - arc_c);
1732 arc_evict_ghost(arc_mfu_ghost, 0, todelete);
1733 }
1734 }
1735
1736 return (stolen);
1737 }
1738
1739 /*
1740 * Remove buffers from list until we've removed the specified number of
1741 * bytes. Destroy the buffers that are removed.
1742 */
1743 static void
1744 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1745 {
1746 arc_buf_hdr_t *ab, *ab_prev;
1747 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1748 kmutex_t *hash_lock;
1749 uint64_t bytes_deleted = 0;
1750 uint64_t bufs_skipped = 0;
1751 boolean_t have_lock;
1752
1753 ASSERT(GHOST_STATE(state));
1754 top:
1755 mutex_enter(&state->arcs_mtx);
1756 for (ab = list_tail(list); ab; ab = ab_prev) {
1757 ab_prev = list_prev(list, ab);
1758 if (spa && ab->b_spa != spa)
1759 continue;
1760 hash_lock = HDR_LOCK(ab);
1761 have_lock = MUTEX_HELD(hash_lock);
1762 if (have_lock || mutex_tryenter(hash_lock)) {
1763 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1764 ASSERT(ab->b_buf == NULL);
1765 ARCSTAT_BUMP(arcstat_deleted);
1766 bytes_deleted += ab->b_size;
1767
1768 if (ab->b_l2hdr != NULL) {
1769 /*
1770 * This buffer is cached on the 2nd Level ARC;
1771 * don't destroy the header.
1772 */
1773 arc_change_state(arc_l2c_only, ab, hash_lock);
1774 if (!have_lock)
1775 mutex_exit(hash_lock);
1776 } else {
1777 arc_change_state(arc_anon, ab, hash_lock);
1778 if (!have_lock)
1779 mutex_exit(hash_lock);
1780 arc_hdr_destroy(ab);
1781 }
1782
1783 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1784 if (bytes >= 0 && bytes_deleted >= bytes)
1785 break;
1786 } else {
1787 if (bytes < 0) {
1788 mutex_exit(&state->arcs_mtx);
1789 mutex_enter(hash_lock);
1790 mutex_exit(hash_lock);
1791 goto top;
1792 }
1793 bufs_skipped += 1;
1794 }
1795 }
1796 mutex_exit(&state->arcs_mtx);
1797
1798 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1799 (bytes < 0 || bytes_deleted < bytes)) {
1800 list = &state->arcs_list[ARC_BUFC_METADATA];
1801 goto top;
1802 }
1803
1804 if (bufs_skipped) {
1805 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1806 ASSERT(bytes >= 0);
1807 }
1808
1809 if (bytes_deleted < bytes)
1810 dprintf("only deleted %lld bytes from %p",
1811 (longlong_t)bytes_deleted, state);
1812 }
1813
1814 static void
1815 arc_adjust(void)
1816 {
1817 int64_t adjustment, delta;
1818
1819 /*
1820 * Adjust MRU size
1821 */
1822
1823 adjustment = MIN(arc_size - arc_c,
1824 arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
1825
1826 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1827 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1828 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
1829 adjustment -= delta;
1830 }
1831
1832 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1833 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1834 (void) arc_evict(arc_mru, 0, delta, FALSE,
1835 ARC_BUFC_METADATA);
1836 }
1837
1838 /*
1839 * Adjust MFU size
1840 */
1841
1842 adjustment = arc_size - arc_c;
1843
1844 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1845 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1846 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
1847 adjustment -= delta;
1848 }
1849
1850 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1851 int64_t delta = MIN(adjustment,
1852 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
1853 (void) arc_evict(arc_mfu, 0, delta, FALSE,
1854 ARC_BUFC_METADATA);
1855 }
1856
1857 /*
1858 * Adjust ghost lists
1859 */
1860
1861 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
1862
1863 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
1864 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
1865 arc_evict_ghost(arc_mru_ghost, 0, delta);
1866 }
1867
1868 adjustment =
1869 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
1870
1871 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
1872 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
1873 arc_evict_ghost(arc_mfu_ghost, 0, delta);
1874 }
1875 }
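/*
 * Worked example of the MRU pass above, with hypothetical sizes: if
 * arc_size == 1000MB, arc_c == 900MB, arc_p == 400MB, and
 * anon + mru + meta_used == 500MB, the MRU adjustment is
 * MIN(1000 - 900, 500 - 400) == 100MB, taken from data buffers first and
 * from metadata only for whatever remains.  The MFU pass then recomputes
 * arc_size - arc_c and evicts whatever is still needed to bring the
 * cache back under its target.
 */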
1876
1877 static void
1878 arc_do_user_evicts(void)
1879 {
1880 mutex_enter(&arc_eviction_mtx);
1881 while (arc_eviction_list != NULL) {
1882 arc_buf_t *buf = arc_eviction_list;
1883 arc_eviction_list = buf->b_next;
1884 rw_enter(&buf->b_lock, RW_WRITER);
1885 buf->b_hdr = NULL;
1886 rw_exit(&buf->b_lock);
1887 mutex_exit(&arc_eviction_mtx);
1888
1889 if (buf->b_efunc != NULL)
1890 VERIFY(buf->b_efunc(buf) == 0);
1891
1892 buf->b_efunc = NULL;
1893 buf->b_private = NULL;
1894 kmem_cache_free(buf_cache, buf);
1895 mutex_enter(&arc_eviction_mtx);
1896 }
1897 mutex_exit(&arc_eviction_mtx);
1898 }
1899
1900 /*
1901 * Flush all *evictable* data from the cache for the given spa.
1902 * NOTE: this will not touch "active" (i.e. referenced) data.
1903 */
1904 void
1905 arc_flush(spa_t *spa)
1906 {
1907 uint64_t guid = 0;
1908
1909 if (spa)
1910 guid = spa_guid(spa);
1911
1912 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
1913 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
1914 if (spa)
1915 break;
1916 }
1917 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
1918 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
1919 if (spa)
1920 break;
1921 }
1922 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
1923 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
1924 if (spa)
1925 break;
1926 }
1927 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
1928 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
1929 if (spa)
1930 break;
1931 }
1932
1933 arc_evict_ghost(arc_mru_ghost, guid, -1);
1934 arc_evict_ghost(arc_mfu_ghost, guid, -1);
1935
1936 mutex_enter(&arc_reclaim_thr_lock);
1937 arc_do_user_evicts();
1938 mutex_exit(&arc_reclaim_thr_lock);
1939 ASSERT(spa || arc_eviction_list == NULL);
1940 }
1941
1942 void
1943 arc_shrink(void)
1944 {
1945 if (arc_c > arc_c_min) {
1946 uint64_t to_free;
1947
1948 #ifdef _KERNEL
1949 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
1950 #else
1951 to_free = arc_c >> arc_shrink_shift;
1952 #endif
1953 if (arc_c > arc_c_min + to_free)
1954 atomic_add_64(&arc_c, -to_free);
1955 else
1956 arc_c = arc_c_min;
1957
1958 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
1959 if (arc_c > arc_size)
1960 arc_c = MAX(arc_size, arc_c_min);
1961 if (arc_p > arc_c)
1962 arc_p = (arc_c >> 1);
1963 ASSERT(arc_c >= arc_c_min);
1964 ASSERT((int64_t)arc_p >= 0);
1965 }
1966
1967 if (arc_size > arc_c)
1968 arc_adjust();
1969 }
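/*
 * Example of a single shrink step, assuming (hypothetically) that
 * arc_shrink_shift is 5 and we are not in the kernel needfree path:
 * with arc_c == 1GB, to_free == 32MB, so arc_c drops to 992MB (but
 * never below arc_c_min) and arc_p loses 1/32nd of its value as well;
 * arc_adjust() then evicts if arc_size still exceeds the new target.
 */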
1970
1971 static int
1972 arc_reclaim_needed(void)
1973 {
1974 uint64_t extra;
1975
1976 #ifdef _KERNEL
1977
1978 if (needfree)
1979 return (1);
1980
1981 /*
1982 * take 'desfree' extra pages, so we reclaim sooner, rather than later
1983 */
1984 extra = desfree;
1985
1986 /*
1987 * check that we're out of range of the pageout scanner. It starts to
1988 * schedule paging if freemem is less than lotsfree and needfree.
1989 * lotsfree is the high-water mark for pageout, and needfree is the
1990 * number of needed free pages. We add extra pages here to make sure
1991 * the scanner doesn't start up while we're freeing memory.
1992 */
1993 if (freemem < lotsfree + needfree + extra)
1994 return (1);
1995
1996 /*
1997 * check to make sure that swapfs has enough space so that anon
1998 * reservations can still succeed. anon_resvmem() checks that the
1999 * availrmem is greater than swapfs_minfree, and the number of reserved
2000 * swap pages. We also add a bit of extra here just to prevent
2001 * circumstances from getting really dire.
2002 */
2003 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2004 return (1);
2005
2006 #if defined(__i386)
2007 /*
2008 * If we're on an i386 platform, it's possible that we'll exhaust the
2009 * kernel heap space before we ever run out of available physical
2010 * memory. Most checks of the size of the kmem_area compare against
2011 * tune.t_minarmem, which is the minimum available real memory that we
2012 * can have in the system. However, this is generally fixed at 25 pages
2013 * which is so low that it's useless. In this comparison, we instead
2014 * look at the total heap size and reclaim if more than 3/4 of the
2015 * heap is allocated (equivalently, in the calculation, if less than
2016 * 1/4 of it is free).
2017 */
2018 if (btop(vmem_size(kmem_arena, VMEM_FREE)) <
2019 (btop(vmem_size(kmem_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
2020 return (1);
2021 #endif
2022
2023 #else
2024 if (spa_get_random(100) == 0)
2025 return (1);
2026 #endif
2027 return (0);
2028 }
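/*
 * In short, reclaim is signalled whenever the pageout scanner is (or is
 * about to become) active or swap reservations are getting tight.  As a
 * hypothetical example, with lotsfree == 16384 pages and desfree == 8192
 * pages, the ARC starts reclaiming once freemem drops below
 * 24576 + needfree pages, i.e. before pageout itself would kick in.
 */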
2029
2030 static void
2031 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2032 {
2033 size_t i;
2034 kmem_cache_t *prev_cache = NULL;
2035 kmem_cache_t *prev_data_cache = NULL;
2036 extern kmem_cache_t *zio_buf_cache[];
2037 extern kmem_cache_t *zio_data_buf_cache[];
2038
2039 #ifdef _KERNEL
2040 if (arc_meta_used >= arc_meta_limit) {
2041 /*
2042 * We are exceeding our meta-data cache limit.
2043 * Purge some DNLC entries to release holds on meta-data.
2044 */
2045 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2046 }
2047 #if defined(__i386)
2048 /*
2049 * Reclaim unused memory from all kmem caches.
2050 */
2051 kmem_reap();
2052 #endif
2053 #endif
2054
2055 /*
2056 * An aggressive reclamation will shrink the cache size as well as
2057 * reap free buffers from the arc kmem caches.
2058 */
2059 if (strat == ARC_RECLAIM_AGGR)
2060 arc_shrink();
2061
2062 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2063 if (zio_buf_cache[i] != prev_cache) {
2064 prev_cache = zio_buf_cache[i];
2065 kmem_cache_reap_now(zio_buf_cache[i]);
2066 }
2067 if (zio_data_buf_cache[i] != prev_data_cache) {
2068 prev_data_cache = zio_data_buf_cache[i];
2069 kmem_cache_reap_now(zio_data_buf_cache[i]);
2070 }
2071 }
2072 kmem_cache_reap_now(buf_cache);
2073 kmem_cache_reap_now(hdr_cache);
2074 }
2075
2076 static void
2077 arc_reclaim_thread(void *unused __unused)
2078 {
2079 clock_t growtime = 0;
2080 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2081 callb_cpr_t cpr;
2082
2083 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2084
2085 mutex_enter(&arc_reclaim_thr_lock);
2086 while (arc_thread_exit == 0) {
2087 if (arc_reclaim_needed()) {
2088
2089 if (arc_no_grow) {
2090 if (last_reclaim == ARC_RECLAIM_CONS) {
2091 last_reclaim = ARC_RECLAIM_AGGR;
2092 } else {
2093 last_reclaim = ARC_RECLAIM_CONS;
2094 }
2095 } else {
2096 arc_no_grow = TRUE;
2097 last_reclaim = ARC_RECLAIM_AGGR;
2098 membar_producer();
2099 }
2100
2101 /* reset the growth delay for every reclaim */
2102 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2103
2104 arc_kmem_reap_now(last_reclaim);
2105 arc_warm = B_TRUE;
2106
2107 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2108 arc_no_grow = FALSE;
2109 }
2110
2111 if (2 * arc_c < arc_size +
2112 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
2113 arc_adjust();
2114
2115 if (arc_eviction_list != NULL)
2116 arc_do_user_evicts();
2117
2118 /* block until needed, or one second, whichever is shorter */
2119 CALLB_CPR_SAFE_BEGIN(&cpr);
2120 (void) cv_timedwait(&arc_reclaim_thr_cv,
2121 &arc_reclaim_thr_lock, (hz));
2122 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2123 }
2124
2125 arc_thread_exit = 0;
2126 cv_broadcast(&arc_reclaim_thr_cv);
2127 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2128 thread_exit();
2129 }
2130
2131 /*
2132 * Adapt arc info given the number of bytes we are trying to add and
2133 * the state that we are coming from. This function is only called
2134 * when we are adding new content to the cache.
2135 */
2136 static void
2137 arc_adapt(int bytes, arc_state_t *state)
2138 {
2139 int mult;
2140 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2141
2142 if (state == arc_l2c_only)
2143 return;
2144
2145 ASSERT(bytes > 0);
2146 /*
2147 * Adapt the target size of the MRU list:
2148 * - if we just hit in the MRU ghost list, then increase
2149 * the target size of the MRU list.
2150 * - if we just hit in the MFU ghost list, then increase
2151 * the target size of the MFU list by decreasing the
2152 * target size of the MRU list.
2153 */
2154 if (state == arc_mru_ghost) {
2155 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2156 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2157
2158 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2159 } else if (state == arc_mfu_ghost) {
2160 uint64_t delta;
2161
2162 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2163 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2164
2165 delta = MIN(bytes * mult, arc_p);
2166 arc_p = MAX(arc_p_min, arc_p - delta);
2167 }
2168 ASSERT((int64_t)arc_p >= 0);
2169
2170 if (arc_reclaim_needed()) {
2171 cv_signal(&arc_reclaim_thr_cv);
2172 return;
2173 }
2174
2175 if (arc_no_grow)
2176 return;
2177
2178 if (arc_c >= arc_c_max)
2179 return;
2180
2181 /*
2182 * If we're within (2 * maxblocksize) bytes of the target
2183 * cache size, increment the target cache size
2184 */
2185 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2186 atomic_add_64(&arc_c, (int64_t)bytes);
2187 if (arc_c > arc_c_max)
2188 arc_c = arc_c_max;
2189 else if (state == arc_anon)
2190 atomic_add_64(&arc_p, (int64_t)bytes);
2191 if (arc_p > arc_c)
2192 arc_p = arc_c;
2193 }
2194 ASSERT((int64_t)arc_p >= 0);
2195 }
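/*
 * Worked example of the ghost-hit adaptation above, with hypothetical
 * sizes: on an arc_mru_ghost hit with arc_mru_ghost->arcs_size == 100MB
 * and arc_mfu_ghost->arcs_size == 300MB, mult == 3, so arc_p grows by
 * 3 * bytes (capped at arc_c - arc_p_min), favoring the MRU side because
 * recently-evicted MRU data is being read again.  A hit in
 * arc_mfu_ghost shrinks arc_p symmetrically.
 */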
2196
2197 /*
2198 * Check if the cache has reached its limits and eviction is required
2199 * prior to insert.
2200 */
2201 static int
2202 arc_evict_needed(arc_buf_contents_t type)
2203 {
2204 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2205 return (1);
2206
2207 #ifdef _KERNEL
2208 /*
2209 * If zio data pages are being allocated out of a separate heap segment,
2210 * then enforce that roughly 1/32 of that arena's address space remains
2211 * free.
2212 */
2213 if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2214 vmem_size(zio_arena, VMEM_FREE) <
2215 (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2216 return (1);
2217 #endif
2218
2219 if (arc_reclaim_needed())
2220 return (1);
2221
2222 return (arc_size > arc_c);
2223 }
2224
2225 /*
2226 * The buffer, supplied as the first argument, needs a data block.
2227 * So, if we are at cache max, determine which cache should be victimized.
2228 * We have the following cases:
2229 *
2230 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2231 * In this situation if we're out of space, but the resident size of the MFU is
2232 * under the limit, victimize the MFU cache to satisfy this insertion request.
2233 *
2234 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2235 * Here, we've used up all of the available space for the MRU, so we need to
2236 * evict from our own cache instead. Evict from the set of resident MRU
2237 * entries.
2238 *
2239 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2240 * c minus p represents the MFU space in the cache, since p is the size of the
2241 * cache that is dedicated to the MRU. In this situation there's still space on
2242 * the MFU side, so the MRU side needs to be victimized.
2243 *
2244 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2245 * MFU's resident set is consuming more space than it has been allotted. In
2246 * this situation, we must victimize our own cache, the MFU, for this insertion.
2247 */
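/*
 * For instance (hypothetical sizes): on an MRU insert with arc_p == 600MB,
 * arc_anon + arc_mru == 500MB, and enough evictable data of the right
 * type on the MFU list, case 1 applies and the MFU list is recycled; if
 * arc_anon + arc_mru were already 700MB, case 2 applies and the MRU list
 * must evict from itself.
 */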
2248 static void
2249 arc_get_data_buf(arc_buf_t *buf)
2250 {
2251 arc_state_t *state = buf->b_hdr->b_state;
2252 uint64_t size = buf->b_hdr->b_size;
2253 arc_buf_contents_t type = buf->b_hdr->b_type;
2254
2255 arc_adapt(size, state);
2256
2257 /*
2258 * We have not yet reached cache maximum size,
2259 * just allocate a new buffer.
2260 */
2261 if (!arc_evict_needed(type)) {
2262 if (type == ARC_BUFC_METADATA) {
2263 buf->b_data = zio_buf_alloc(size);
2264 arc_space_consume(size, ARC_SPACE_DATA);
2265 } else {
2266 ASSERT(type == ARC_BUFC_DATA);
2267 buf->b_data = zio_data_buf_alloc(size);
2268 ARCSTAT_INCR(arcstat_data_size, size);
2269 atomic_add_64(&arc_size, size);
2270 }
2271 goto out;
2272 }
2273
2274 /*
2275 * If we are prefetching from the mfu ghost list, this buffer
2276 * will end up on the mru list; so steal space from there.
2277 */
2278 if (state == arc_mfu_ghost)
2279 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2280 else if (state == arc_mru_ghost)
2281 state = arc_mru;
2282
2283 if (state == arc_mru || state == arc_anon) {
2284 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2285 state = (arc_mfu->arcs_lsize[type] >= size &&
2286 arc_p > mru_used) ? arc_mfu : arc_mru;
2287 } else {
2288 /* MFU cases */
2289 uint64_t mfu_space = arc_c - arc_p;
2290 state = (arc_mru->arcs_lsize[type] >= size &&
2291 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2292 }
2293 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2294 if (type == ARC_BUFC_METADATA) {
2295 buf->b_data = zio_buf_alloc(size);
2296 arc_space_consume(size, ARC_SPACE_DATA);
2297 } else {
2298 ASSERT(type == ARC_BUFC_DATA);
2299 buf->b_data = zio_data_buf_alloc(size);
2300 ARCSTAT_INCR(arcstat_data_size, size);
2301 atomic_add_64(&arc_size, size);
2302 }
2303 ARCSTAT_BUMP(arcstat_recycle_miss);
2304 }
2305 ASSERT(buf->b_data != NULL);
2306 out:
2307 /*
2308 * Update the state size. Note that ghost states have a
2309 * "ghost size" and so don't need to be updated.
2310 */
2311 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2312 arc_buf_hdr_t *hdr = buf->b_hdr;
2313
2314 atomic_add_64(&hdr->b_state->arcs_size, size);
2315 if (list_link_active(&hdr->b_arc_node)) {
2316 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2317 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2318 }
2319 /*
2320 * If we are growing the cache, and we are adding anonymous
2321 * data, and we have outgrown arc_p, update arc_p
2322 */
2323 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2324 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2325 arc_p = MIN(arc_c, arc_p + size);
2326 }
2327 }
2328
2329 /*
2330 * This routine is called whenever a buffer is accessed.
2331 * NOTE: the hash lock is dropped in this function.
2332 */
2333 static void
2334 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2335 {
2336 clock_t now;
2337
2338 ASSERT(MUTEX_HELD(hash_lock));
2339
2340 if (buf->b_state == arc_anon) {
2341 /*
2342 * This buffer is not in the cache, and does not
2343 * appear in our "ghost" list. Add the new buffer
2344 * to the MRU state.
2345 */
2346
2347 ASSERT(buf->b_arc_access == 0);
2348 buf->b_arc_access = ddi_get_lbolt();
2349 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2350 arc_change_state(arc_mru, buf, hash_lock);
2351
2352 } else if (buf->b_state == arc_mru) {
2353 now = ddi_get_lbolt();
2354
2355 /*
2356 * If this buffer is here because of a prefetch, then either:
2357 * - clear the flag if this is a "referencing" read
2358 * (any subsequent access will bump this into the MFU state).
2359 * or
2360 * - move the buffer to the head of the list if this is
2361 * another prefetch (to make it less likely to be evicted).
2362 */
2363 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2364 if (refcount_count(&buf->b_refcnt) == 0) {
2365 ASSERT(list_link_active(&buf->b_arc_node));
2366 } else {
2367 buf->b_flags &= ~ARC_PREFETCH;
2368 ARCSTAT_BUMP(arcstat_mru_hits);
2369 }
2370 buf->b_arc_access = now;
2371 return;
2372 }
2373
2374 /*
2375 * This buffer has been "accessed" only once so far,
2376 * but it is still in the cache. Move it to the MFU
2377 * state.
2378 */
2379 if (now > buf->b_arc_access + ARC_MINTIME) {
2380 /*
2381 * More than 125ms have passed since we
2382 * instantiated this buffer. Move it to the
2383 * most frequently used state.
2384 */
2385 buf->b_arc_access = now;
2386 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2387 arc_change_state(arc_mfu, buf, hash_lock);
2388 }
2389 ARCSTAT_BUMP(arcstat_mru_hits);
2390 } else if (buf->b_state == arc_mru_ghost) {
2391 arc_state_t *new_state;
2392 /*
2393 * This buffer has been "accessed" recently, but
2394 * was evicted from the cache. Move it to the
2395 * MFU state.
2396 */
2397
2398 if (buf->b_flags & ARC_PREFETCH) {
2399 new_state = arc_mru;
2400 if (refcount_count(&buf->b_refcnt) > 0)
2401 buf->b_flags &= ~ARC_PREFETCH;
2402 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2403 } else {
2404 new_state = arc_mfu;
2405 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2406 }
2407
2408 buf->b_arc_access = ddi_get_lbolt();
2409 arc_change_state(new_state, buf, hash_lock);
2410
2411 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2412 } else if (buf->b_state == arc_mfu) {
2413 /*
2414 * This buffer has been accessed more than once and is
2415 * still in the cache. Keep it in the MFU state.
2416 *
2417 * NOTE: an add_reference() that occurred when we did
2418 * the arc_read() will have kicked this off the list.
2419 * If it was a prefetch, we will explicitly move it to
2420 * the head of the list now.
2421 */
2422 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2423 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2424 ASSERT(list_link_active(&buf->b_arc_node));
2425 }
2426 ARCSTAT_BUMP(arcstat_mfu_hits);
2427 buf->b_arc_access = ddi_get_lbolt();
2428 } else if (buf->b_state == arc_mfu_ghost) {
2429 arc_state_t *new_state = arc_mfu;
2430 /*
2431 * This buffer has been accessed more than once but has
2432 * been evicted from the cache. Move it back to the
2433 * MFU state.
2434 */
2435
2436 if (buf->b_flags & ARC_PREFETCH) {
2437 /*
2438 * This is a prefetch access...
2439 * move this block back to the MRU state.
2440 */
2441 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2442 new_state = arc_mru;
2443 }
2444
2445 buf->b_arc_access = ddi_get_lbolt();
2446 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2447 arc_change_state(new_state, buf, hash_lock);
2448
2449 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2450 } else if (buf->b_state == arc_l2c_only) {
2451 /*
2452 * This buffer is on the 2nd Level ARC.
2453 */
2454
2455 buf->b_arc_access = ddi_get_lbolt();
2456 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2457 arc_change_state(arc_mfu, buf, hash_lock);
2458 } else {
2459 ASSERT(!"invalid arc state");
2460 }
2461 }
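/*
 * Summary of the transitions performed above (a reading aid; the
 * prefetch special cases are simplified):
 *
 *	current state		next state on access
 *	arc_anon		arc_mru
 *	arc_mru			arc_mfu (once re-accessed after ARC_MINTIME)
 *	arc_mru_ghost		arc_mfu (arc_mru if the access is a prefetch)
 *	arc_mfu			arc_mfu
 *	arc_mfu_ghost		arc_mfu (arc_mru if the access is a prefetch)
 *	arc_l2c_only		arc_mfu
 */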
2462
2463 /* a generic arc_done_func_t which you can use */
2464 /* ARGSUSED */
2465 void
2466 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2467 {
2468 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2469 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2470 }
2471
2472 /* a generic arc_done_func_t */
2473 void
2474 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2475 {
2476 arc_buf_t **bufp = arg;
2477 if (zio && zio->io_error) {
2478 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2479 *bufp = NULL;
2480 } else {
2481 *bufp = buf;
2482 }
2483 }
2484
2485 static void
2486 arc_read_done(zio_t *zio)
2487 {
2488 arc_buf_hdr_t *hdr, *found;
2489 arc_buf_t *buf;
2490 arc_buf_t *abuf; /* buffer we're assigning to callback */
2491 kmutex_t *hash_lock;
2492 arc_callback_t *callback_list, *acb;
2493 int freeable = FALSE;
2494
2495 buf = zio->io_private;
2496 hdr = buf->b_hdr;
2497
2498 /*
2499 * The hdr was inserted into hash-table and removed from lists
2500 * prior to starting I/O. We should find this header, since
2501 * it's in the hash table, and it should be legit since it's
2502 * not possible to evict it during the I/O. The only possible
2503 * reason for it not to be found is if we were freed during the
2504 * read.
2505 */
2506 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2507 &hash_lock);
2508
2509 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2510 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2511 (found == hdr && HDR_L2_READING(hdr)));
2512
2513 hdr->b_flags &= ~ARC_L2_EVICTED;
2514 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2515 hdr->b_flags &= ~ARC_L2CACHE;
2516
2517 /* byteswap if necessary */
2518 callback_list = hdr->b_acb;
2519 ASSERT(callback_list != NULL);
2520 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2521 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2522 byteswap_uint64_array :
2523 dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2524 func(buf->b_data, hdr->b_size);
2525 }
2526
2527 arc_cksum_compute(buf, B_FALSE);
2528
2529 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2530 /*
2531 * Only call arc_access on anonymous buffers. This is because
2532 * if we've issued an I/O for an evicted buffer, we've already
2533 * called arc_access (to prevent any simultaneous readers from
2534 * getting confused).
2535 */
2536 arc_access(hdr, hash_lock);
2537 }
2538
2539 /* create copies of the data buffer for the callers */
2540 abuf = buf;
2541 for (acb = callback_list; acb; acb = acb->acb_next) {
2542 if (acb->acb_done) {
2543 if (abuf == NULL)
2544 abuf = arc_buf_clone(buf);
2545 acb->acb_buf = abuf;
2546 abuf = NULL;
2547 }
2548 }
2549 hdr->b_acb = NULL;
2550 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2551 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2552 if (abuf == buf) {
2553 ASSERT(buf->b_efunc == NULL);
2554 ASSERT(hdr->b_datacnt == 1);
2555 hdr->b_flags |= ARC_BUF_AVAILABLE;
2556 }
2557
2558 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2559
2560 if (zio->io_error != 0) {
2561 hdr->b_flags |= ARC_IO_ERROR;
2562 if (hdr->b_state != arc_anon)
2563 arc_change_state(arc_anon, hdr, hash_lock);
2564 if (HDR_IN_HASH_TABLE(hdr))
2565 buf_hash_remove(hdr);
2566 freeable = refcount_is_zero(&hdr->b_refcnt);
2567 }
2568
2569 /*
2570 * Broadcast before we drop the hash_lock to avoid the possibility
2571 * that the hdr (and hence the cv) might be freed before we get to
2572 * the cv_broadcast().
2573 */
2574 cv_broadcast(&hdr->b_cv);
2575
2576 if (hash_lock) {
2577 mutex_exit(hash_lock);
2578 } else {
2579 /*
2580 * This block was freed while we waited for the read to
2581 * complete. It has been removed from the hash table and
2582 * moved to the anonymous state (so that it won't show up
2583 * in the cache).
2584 */
2585 ASSERT3P(hdr->b_state, ==, arc_anon);
2586 freeable = refcount_is_zero(&hdr->b_refcnt);
2587 }
2588
2589 /* execute each callback and free its structure */
2590 while ((acb = callback_list) != NULL) {
2591 if (acb->acb_done)
2592 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2593
2594 if (acb->acb_zio_dummy != NULL) {
2595 acb->acb_zio_dummy->io_error = zio->io_error;
2596 zio_nowait(acb->acb_zio_dummy);
2597 }
2598
2599 callback_list = acb->acb_next;
2600 kmem_free(acb, sizeof (arc_callback_t));
2601 }
2602
2603 if (freeable)
2604 arc_hdr_destroy(hdr);
2605 }
2606
2607 /*
2608 * "Read" the block block at the specified DVA (in bp) via the
2609 * cache. If the block is found in the cache, invoke the provided
2610 * callback immediately and return. Note that the `zio' parameter
2611 * in the callback will be NULL in this case, since no IO was
2612 * required. If the block is not in the cache pass the read request
2613 * on to the spa with a substitute callback function, so that the
2614 * requested block will be added to the cache.
2615 *
2616 * If a read request arrives for a block that has a read in-progress,
2617 * either wait for the in-progress read to complete (and return the
2618 * results); or, if this is a read with a "done" func, add a record
2619 * to the read to invoke the "done" func when the read completes,
2620 * and return; or just return.
2621 *
2622 * arc_read_done() will invoke all the requested "done" functions
2623 * for readers of this block.
2624 *
2625 * Normal callers should use arc_read and pass the arc buffer and offset
2626 * for the bp. But if you know you don't need locking, you can use
2627 * arc_read_nolock.
2628 */
2629 int
2630 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
2631 arc_done_func_t *done, void *private, int priority, int zio_flags,
2632 uint32_t *arc_flags, const zbookmark_t *zb)
2633 {
2634 int err;
2635
2636 ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2637 ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
2638 rw_enter(&pbuf->b_lock, RW_READER);
2639
2640 err = arc_read_nolock(pio, spa, bp, done, private, priority,
2641 zio_flags, arc_flags, zb);
2642 rw_exit(&pbuf->b_lock);
2643
2644 return (err);
2645 }
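/*
 * Illustrative (hypothetical) call pattern for a synchronous read,
 * assuming the caller already holds a reference on the parent buffer
 * `pbuf' that contains the block pointer, and has filled in a bookmark
 * `zb':
 *
 *	uint32_t aflags = ARC_WAIT;
 *	arc_buf_t *abuf = NULL;
 *
 *	error = arc_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * With ARC_WAIT the call returns after the done func has run (so abuf is
 * valid on success); with ARC_NOWAIT it returns immediately and the done
 * func runs later from the zio pipeline.
 */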
2646
2647 int
2648 arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
2649 arc_done_func_t *done, void *private, int priority, int zio_flags,
2650 uint32_t *arc_flags, const zbookmark_t *zb)
2651 {
2652 arc_buf_hdr_t *hdr;
2653 arc_buf_t *buf;
2654 kmutex_t *hash_lock;
2655 zio_t *rzio;
2656 uint64_t guid = spa_guid(spa);
2657
2658 top:
2659 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2660 &hash_lock);
2661 if (hdr && hdr->b_datacnt > 0) {
2662
2663 *arc_flags |= ARC_CACHED;
2664
2665 if (HDR_IO_IN_PROGRESS(hdr)) {
2666
2667 if (*arc_flags & ARC_WAIT) {
2668 cv_wait(&hdr->b_cv, hash_lock);
2669 mutex_exit(hash_lock);
2670 goto top;
2671 }
2672 ASSERT(*arc_flags & ARC_NOWAIT);
2673
2674 if (done) {
2675 arc_callback_t *acb = NULL;
2676
2677 acb = kmem_zalloc(sizeof (arc_callback_t),
2678 KM_SLEEP);
2679 acb->acb_done = done;
2680 acb->acb_private = private;
2681 if (pio != NULL)
2682 acb->acb_zio_dummy = zio_null(pio,
2683 spa, NULL, NULL, NULL, zio_flags);
2684
2685 ASSERT(acb->acb_done != NULL);
2686 acb->acb_next = hdr->b_acb;
2687 hdr->b_acb = acb;
2688 add_reference(hdr, hash_lock, private);
2689 mutex_exit(hash_lock);
2690 return (0);
2691 }
2692 mutex_exit(hash_lock);
2693 return (0);
2694 }
2695
2696 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2697
2698 if (done) {
2699 add_reference(hdr, hash_lock, private);
2700 /*
2701 * If this block is already in use, create a new
2702 * copy of the data so that we will be guaranteed
2703 * that arc_release() will always succeed.
2704 */
2705 buf = hdr->b_buf;
2706 ASSERT(buf);
2707 ASSERT(buf->b_data);
2708 if (HDR_BUF_AVAILABLE(hdr)) {
2709 ASSERT(buf->b_efunc == NULL);
2710 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2711 } else {
2712 buf = arc_buf_clone(buf);
2713 }
2714
2715 } else if (*arc_flags & ARC_PREFETCH &&
2716 refcount_count(&hdr->b_refcnt) == 0) {
2717 hdr->b_flags |= ARC_PREFETCH;
2718 }
2719 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2720 arc_access(hdr, hash_lock);
2721 if (*arc_flags & ARC_L2CACHE)
2722 hdr->b_flags |= ARC_L2CACHE;
2723 mutex_exit(hash_lock);
2724 ARCSTAT_BUMP(arcstat_hits);
2725 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2726 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2727 data, metadata, hits);
2728
2729 if (done)
2730 done(NULL, buf, private);
2731 } else {
2732 uint64_t size = BP_GET_LSIZE(bp);
2733 arc_callback_t *acb;
2734 vdev_t *vd = NULL;
2735 uint64_t addr;
2736 boolean_t devw = B_FALSE;
2737
2738 if (hdr == NULL) {
2739 /* this block is not in the cache */
2740 arc_buf_hdr_t *exists;
2741 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2742 buf = arc_buf_alloc(spa, size, private, type);
2743 hdr = buf->b_hdr;
2744 hdr->b_dva = *BP_IDENTITY(bp);
2745 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2746 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2747 exists = buf_hash_insert(hdr, &hash_lock);
2748 if (exists) {
2749 /* somebody beat us to the hash insert */
2750 mutex_exit(hash_lock);
2751 bzero(&hdr->b_dva, sizeof (dva_t));
2752 hdr->b_birth = 0;
2753 hdr->b_cksum0 = 0;
2754 (void) arc_buf_remove_ref(buf, private);
2755 goto top; /* restart the IO request */
2756 }
2757 /* if this is a prefetch, we don't have a reference */
2758 if (*arc_flags & ARC_PREFETCH) {
2759 (void) remove_reference(hdr, hash_lock,
2760 private);
2761 hdr->b_flags |= ARC_PREFETCH;
2762 }
2763 if (*arc_flags & ARC_L2CACHE)
2764 hdr->b_flags |= ARC_L2CACHE;
2765 if (BP_GET_LEVEL(bp) > 0)
2766 hdr->b_flags |= ARC_INDIRECT;
2767 } else {
2768 /* this block is in the ghost cache */
2769 ASSERT(GHOST_STATE(hdr->b_state));
2770 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2771 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
2772 ASSERT(hdr->b_buf == NULL);
2773
2774 /* if this is a prefetch, we don't have a reference */
2775 if (*arc_flags & ARC_PREFETCH)
2776 hdr->b_flags |= ARC_PREFETCH;
2777 else
2778 add_reference(hdr, hash_lock, private);
2779 if (*arc_flags & ARC_L2CACHE)
2780 hdr->b_flags |= ARC_L2CACHE;
2781 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2782 buf->b_hdr = hdr;
2783 buf->b_data = NULL;
2784 buf->b_efunc = NULL;
2785 buf->b_private = NULL;
2786 buf->b_next = NULL;
2787 hdr->b_buf = buf;
2788 arc_get_data_buf(buf);
2789 ASSERT(hdr->b_datacnt == 0);
2790 hdr->b_datacnt = 1;
2791 }
2792
2793 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2794 acb->acb_done = done;
2795 acb->acb_private = private;
2796
2797 ASSERT(hdr->b_acb == NULL);
2798 hdr->b_acb = acb;
2799 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2800
2801 /*
2802 * If the buffer has been evicted, migrate it to a present state
2803 * before issuing the I/O. Once we drop the hash-table lock,
2804 * the header will be marked as I/O in progress and have an
2805 * attached buffer. At this point, anybody who finds this
2806 * buffer ought to notice that it's legit but has a pending I/O.
2807 */
2808
2809 if (GHOST_STATE(hdr->b_state))
2810 arc_access(hdr, hash_lock);
2811
2812 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2813 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2814 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2815 addr = hdr->b_l2hdr->b_daddr;
2816 /*
2817 * Lock out device removal.
2818 */
2819 if (vdev_is_dead(vd) ||
2820 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2821 vd = NULL;
2822 }
2823
2824 mutex_exit(hash_lock);
2825
2826 ASSERT3U(hdr->b_size, ==, size);
2827 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
2828 uint64_t, size, zbookmark_t *, zb);
2829 ARCSTAT_BUMP(arcstat_misses);
2830 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2831 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2832 data, metadata, misses);
2833
2834 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
2835 /*
2836 * Read from the L2ARC if the following are true:
2837 * 1. The L2ARC vdev was previously cached.
2838 * 2. This buffer still has L2ARC metadata.
2839 * 3. This buffer isn't currently writing to the L2ARC.
2840 * 4. The L2ARC entry wasn't evicted, which may
2841 * also have invalidated the vdev.
2842 * 5. This isn't a prefetch while l2arc_noprefetch is set.
2843 */
2844 if (hdr->b_l2hdr != NULL &&
2845 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
2846 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
2847 l2arc_read_callback_t *cb;
2848
2849 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2850 ARCSTAT_BUMP(arcstat_l2_hits);
2851
2852 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2853 KM_SLEEP);
2854 cb->l2rcb_buf = buf;
2855 cb->l2rcb_spa = spa;
2856 cb->l2rcb_bp = *bp;
2857 cb->l2rcb_zb = *zb;
2858 cb->l2rcb_flags = zio_flags;
2859
2860 /*
2861 * l2arc read. The SCL_L2ARC lock will be
2862 * released by l2arc_read_done().
2863 */
2864 rzio = zio_read_phys(pio, vd, addr, size,
2865 buf->b_data, ZIO_CHECKSUM_OFF,
2866 l2arc_read_done, cb, priority, zio_flags |
2867 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
2868 ZIO_FLAG_DONT_PROPAGATE |
2869 ZIO_FLAG_DONT_RETRY, B_FALSE);
2870 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
2871 zio_t *, rzio);
2872 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
2873
2874 if (*arc_flags & ARC_NOWAIT) {
2875 zio_nowait(rzio);
2876 return (0);
2877 }
2878
2879 ASSERT(*arc_flags & ARC_WAIT);
2880 if (zio_wait(rzio) == 0)
2881 return (0);
2882
2883 /* l2arc read error; goto zio_read() */
2884 } else {
2885 DTRACE_PROBE1(l2arc__miss,
2886 arc_buf_hdr_t *, hdr);
2887 ARCSTAT_BUMP(arcstat_l2_misses);
2888 if (HDR_L2_WRITING(hdr))
2889 ARCSTAT_BUMP(arcstat_l2_rw_clash);
2890 spa_config_exit(spa, SCL_L2ARC, vd);
2891 }
2892 } else {
2893 if (vd != NULL)
2894 spa_config_exit(spa, SCL_L2ARC, vd);
2895 if (l2arc_ndev != 0) {
2896 DTRACE_PROBE1(l2arc__miss,
2897 arc_buf_hdr_t *, hdr);
2898 ARCSTAT_BUMP(arcstat_l2_misses);
2899 }
2900 }
2901
2902 rzio = zio_read(pio, spa, bp, buf->b_data, size,
2903 arc_read_done, buf, priority, zio_flags, zb);
2904
2905 if (*arc_flags & ARC_WAIT)
2906 return (zio_wait(rzio));
2907
2908 ASSERT(*arc_flags & ARC_NOWAIT);
2909 zio_nowait(rzio);
2910 }
2911 return (0);
2912 }
2913
2914 void
2915 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
2916 {
2917 ASSERT(buf->b_hdr != NULL);
2918 ASSERT(buf->b_hdr->b_state != arc_anon);
2919 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
2920 ASSERT(buf->b_efunc == NULL);
2921 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
2922
2923 buf->b_efunc = func;
2924 buf->b_private = private;
2925 }
2926
2927 /*
2928 * This is used by the DMU to let the ARC know that a buffer is
2929 * being evicted, so the ARC should clean up. If this arc buf
2930 * is not yet in the evicted state, it will be put there.
2931 */
2932 int
2933 arc_buf_evict(arc_buf_t *buf)
2934 {
2935 arc_buf_hdr_t *hdr;
2936 kmutex_t *hash_lock;
2937 arc_buf_t **bufp;
2938
2939 rw_enter(&buf->b_lock, RW_WRITER);
2940 hdr = buf->b_hdr;
2941 if (hdr == NULL) {
2942 /*
2943 * We are in arc_do_user_evicts().
2944 */
2945 ASSERT(buf->b_data == NULL);
2946 rw_exit(&buf->b_lock);
2947 return (0);
2948 } else if (buf->b_data == NULL) {
2949 arc_buf_t copy = *buf; /* structure assignment */
2950 /*
2951 * We are on the eviction list; process this buffer now
2952 * but let arc_do_user_evicts() do the reaping.
2953 */
2954 buf->b_efunc = NULL;
2955 rw_exit(&buf->b_lock);
2956 VERIFY(copy.b_efunc(&copy) == 0);
2957 return (1);
2958 }
2959 hash_lock = HDR_LOCK(hdr);
2960 mutex_enter(hash_lock);
2961
2962 ASSERT(buf->b_hdr == hdr);
2963 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2964 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2965
2966 /*
2967 * Pull this buffer off of the hdr
2968 */
2969 bufp = &hdr->b_buf;
2970 while (*bufp != buf)
2971 bufp = &(*bufp)->b_next;
2972 *bufp = buf->b_next;
2973
2974 ASSERT(buf->b_data != NULL);
2975 arc_buf_destroy(buf, FALSE, FALSE);
2976
2977 if (hdr->b_datacnt == 0) {
2978 arc_state_t *old_state = hdr->b_state;
2979 arc_state_t *evicted_state;
2980
2981 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2982
2983 evicted_state =
2984 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2985
2986 mutex_enter(&old_state->arcs_mtx);
2987 mutex_enter(&evicted_state->arcs_mtx);
2988
2989 arc_change_state(evicted_state, hdr, hash_lock);
2990 ASSERT(HDR_IN_HASH_TABLE(hdr));
2991 hdr->b_flags |= ARC_IN_HASH_TABLE;
2992 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2993
2994 mutex_exit(&evicted_state->arcs_mtx);
2995 mutex_exit(&old_state->arcs_mtx);
2996 }
2997 mutex_exit(hash_lock);
2998 rw_exit(&buf->b_lock);
2999
3000 VERIFY(buf->b_efunc(buf) == 0);
3001 buf->b_efunc = NULL;
3002 buf->b_private = NULL;
3003 buf->b_hdr = NULL;
3004 kmem_cache_free(buf_cache, buf);
3005 return (1);
3006 }
3007
3008 /*
3009 * Release this buffer from the cache. This must be done
3010 * after a read and prior to modifying the buffer contents.
3011 * If the buffer has more than one reference, we must make
3012 * a new hdr for the buffer.
3013 */
3014 void
3015 arc_release(arc_buf_t *buf, void *tag)
3016 {
3017 arc_buf_hdr_t *hdr;
3018 kmutex_t *hash_lock;
3019 l2arc_buf_hdr_t *l2hdr;
3020 uint64_t buf_size;
3021 boolean_t released = B_FALSE;
3022
3023 rw_enter(&buf->b_lock, RW_WRITER);
3024 hdr = buf->b_hdr;
3025
3026 /* this buffer is not on any list */
3027 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3028
3029 if (hdr->b_state == arc_anon) {
3030 /* this buffer is already released */
3031 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
3032 ASSERT(BUF_EMPTY(hdr));
3033 ASSERT(buf->b_efunc == NULL);
3034 arc_buf_thaw(buf);
3035 rw_exit(&buf->b_lock);
3036 released = B_TRUE;
3037 } else {
3038 hash_lock = HDR_LOCK(hdr);
3039 mutex_enter(hash_lock);
3040 }
3041
3042 l2hdr = hdr->b_l2hdr;
3043 if (l2hdr) {
3044 mutex_enter(&l2arc_buflist_mtx);
3045 hdr->b_l2hdr = NULL;
3046 buf_size = hdr->b_size;
3047 }
3048
3049 if (released)
3050 goto out;
3051
3052 /*
3053 * Do we have more than one buf?
3054 */
3055 if (hdr->b_datacnt > 1) {
3056 arc_buf_hdr_t *nhdr;
3057 arc_buf_t **bufp;
3058 uint64_t blksz = hdr->b_size;
3059 uint64_t spa = hdr->b_spa;
3060 arc_buf_contents_t type = hdr->b_type;
3061 uint32_t flags = hdr->b_flags;
3062
3063 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3064 /*
3065 * Pull the data off of this buf and attach it to
3066 * a new anonymous buf.
3067 */
3068 (void) remove_reference(hdr, hash_lock, tag);
3069 bufp = &hdr->b_buf;
3070 while (*bufp != buf)
3071 bufp = &(*bufp)->b_next;
3072 *bufp = (*bufp)->b_next;
3073 buf->b_next = NULL;
3074
3075 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3076 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3077 if (refcount_is_zero(&hdr->b_refcnt)) {
3078 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3079 ASSERT3U(*size, >=, hdr->b_size);
3080 atomic_add_64(size, -hdr->b_size);
3081 }
3082 hdr->b_datacnt -= 1;
3083 arc_cksum_verify(buf);
3084
3085 mutex_exit(hash_lock);
3086
3087 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3088 nhdr->b_size = blksz;
3089 nhdr->b_spa = spa;
3090 nhdr->b_type = type;
3091 nhdr->b_buf = buf;
3092 nhdr->b_state = arc_anon;
3093 nhdr->b_arc_access = 0;
3094 nhdr->b_flags = flags & ARC_L2_WRITING;
3095 nhdr->b_l2hdr = NULL;
3096 nhdr->b_datacnt = 1;
3097 nhdr->b_freeze_cksum = NULL;
3098 (void) refcount_add(&nhdr->b_refcnt, tag);
3099 buf->b_hdr = nhdr;
3100 rw_exit(&buf->b_lock);
3101 atomic_add_64(&arc_anon->arcs_size, blksz);
3102 } else {
3103 rw_exit(&buf->b_lock);
3104 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3105 ASSERT(!list_link_active(&hdr->b_arc_node));
3106 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3107 arc_change_state(arc_anon, hdr, hash_lock);
3108 hdr->b_arc_access = 0;
3109 mutex_exit(hash_lock);
3110
3111 bzero(&hdr->b_dva, sizeof (dva_t));
3112 hdr->b_birth = 0;
3113 hdr->b_cksum0 = 0;
3114 arc_buf_thaw(buf);
3115 }
3116 buf->b_efunc = NULL;
3117 buf->b_private = NULL;
3118
3119 out:
3120 if (l2hdr) {
3121 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3122 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3123 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3124 mutex_exit(&l2arc_buflist_mtx);
3125 }
3126 }
3127
3128 int
3129 arc_released(arc_buf_t *buf)
3130 {
3131 int released;
3132
3133 rw_enter(&buf->b_lock, RW_READER);
3134 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3135 rw_exit(&buf->b_lock);
3136 return (released);
3137 }
3138
3139 int
3140 arc_has_callback(arc_buf_t *buf)
3141 {
3142 int callback;
3143
3144 rw_enter(&buf->b_lock, RW_READER);
3145 callback = (buf->b_efunc != NULL);
3146 rw_exit(&buf->b_lock);
3147 return (callback);
3148 }
3149
3150 #ifdef ZFS_DEBUG
3151 int
3152 arc_referenced(arc_buf_t *buf)
3153 {
3154 int referenced;
3155
3156 rw_enter(&buf->b_lock, RW_READER);
3157 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3158 rw_exit(&buf->b_lock);
3159 return (referenced);
3160 }
3161 #endif
3162
3163 static void
3164 arc_write_ready(zio_t *zio)
3165 {
3166 arc_write_callback_t *callback = zio->io_private;
3167 arc_buf_t *buf = callback->awcb_buf;
3168 arc_buf_hdr_t *hdr = buf->b_hdr;
3169
3170 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3171 callback->awcb_ready(zio, buf, callback->awcb_private);
3172
3173 /*
3174 * If the IO is already in progress, then this is a re-write
3175 * attempt, so we need to thaw and re-compute the cksum.
3176 * It is the responsibility of the callback to handle the
3177 * accounting for any re-write attempt.
3178 */
3179 if (HDR_IO_IN_PROGRESS(hdr)) {
3180 mutex_enter(&hdr->b_freeze_lock);
3181 if (hdr->b_freeze_cksum != NULL) {
3182 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3183 hdr->b_freeze_cksum = NULL;
3184 }
3185 mutex_exit(&hdr->b_freeze_lock);
3186 }
3187 arc_cksum_compute(buf, B_FALSE);
3188 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3189 }
3190
3191 static void
3192 arc_write_done(zio_t *zio)
3193 {
3194 arc_write_callback_t *callback = zio->io_private;
3195 arc_buf_t *buf = callback->awcb_buf;
3196 arc_buf_hdr_t *hdr = buf->b_hdr;
3197
3198 ASSERT(hdr->b_acb == NULL);
3199
3200 if (zio->io_error == 0) {
3201 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3202 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3203 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3204 } else {
3205 ASSERT(BUF_EMPTY(hdr));
3206 }
3207
3208 /*
3209 * If the block to be written was all-zero, we may have
3210 * compressed it away. In this case no write was performed
3211 * so there will be no dva/birth-date/checksum. The buffer
3212 * must therefore remain anonymous (and uncached).
3213 */
3214 if (!BUF_EMPTY(hdr)) {
3215 arc_buf_hdr_t *exists;
3216 kmutex_t *hash_lock;
3217
3218 ASSERT(zio->io_error == 0);
3219
3220 arc_cksum_verify(buf);
3221
3222 exists = buf_hash_insert(hdr, &hash_lock);
3223 if (exists) {
3224 /*
3225 * This can only happen if we overwrite for
3226 * sync-to-convergence, because we remove
3227 * buffers from the hash table when we arc_free().
3228 */
3229 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3230 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3231 panic("bad overwrite, hdr=%p exists=%p",
3232 (void *)hdr, (void *)exists);
3233 ASSERT(refcount_is_zero(&exists->b_refcnt));
3234 arc_change_state(arc_anon, exists, hash_lock);
3235 mutex_exit(hash_lock);
3236 arc_hdr_destroy(exists);
3237 exists = buf_hash_insert(hdr, &hash_lock);
3238 ASSERT3P(exists, ==, NULL);
3239 } else {
3240 /* Dedup */
3241 ASSERT(hdr->b_datacnt == 1);
3242 ASSERT(hdr->b_state == arc_anon);
3243 ASSERT(BP_GET_DEDUP(zio->io_bp));
3244 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3245 }
3246 }
3247 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3248 /* if it's not anon, we are doing a scrub */
3249 if (!exists && hdr->b_state == arc_anon)
3250 arc_access(hdr, hash_lock);
3251 mutex_exit(hash_lock);
3252 } else {
3253 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3254 }
3255
3256 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3257 callback->awcb_done(zio, buf, callback->awcb_private);
3258
3259 kmem_free(callback, sizeof (arc_write_callback_t));
3260 }
3261
3262 zio_t *
3263 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3264 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3265 arc_done_func_t *ready, arc_done_func_t *done, void *private,
3266 int priority, int zio_flags, const zbookmark_t *zb)
3267 {
3268 arc_buf_hdr_t *hdr = buf->b_hdr;
3269 arc_write_callback_t *callback;
3270 zio_t *zio;
3271
3272 ASSERT(ready != NULL);
3273 ASSERT(done != NULL);
3274 ASSERT(!HDR_IO_ERROR(hdr));
3275 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3276 ASSERT(hdr->b_acb == NULL);
3277 if (l2arc)
3278 hdr->b_flags |= ARC_L2CACHE;
3279 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3280 callback->awcb_ready = ready;
3281 callback->awcb_done = done;
3282 callback->awcb_private = private;
3283 callback->awcb_buf = buf;
3284
3285 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3286 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3287
3288 return (zio);
3289 }
3290
3291 void
3292 arc_free(spa_t *spa, const blkptr_t *bp)
3293 {
3294 arc_buf_hdr_t *ab;
3295 kmutex_t *hash_lock;
3296 uint64_t guid = spa_guid(spa);
3297
3298 /*
3299 * If this buffer is in the cache, release it, so it can be re-used.
3300 */
3301 ab = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3302 &hash_lock);
3303 if (ab != NULL) {
3304 if (ab->b_state != arc_anon)
3305 arc_change_state(arc_anon, ab, hash_lock);
3306 if (HDR_IO_IN_PROGRESS(ab)) {
3307 /*
3308 * This should only happen when we prefetch.
3309 */
3310 ASSERT(ab->b_flags & ARC_PREFETCH);
3311 ASSERT3U(ab->b_datacnt, ==, 1);
3312 ab->b_flags |= ARC_FREED_IN_READ;
3313 if (HDR_IN_HASH_TABLE(ab))
3314 buf_hash_remove(ab);
3315 ab->b_arc_access = 0;
3316 bzero(&ab->b_dva, sizeof (dva_t));
3317 ab->b_birth = 0;
3318 ab->b_cksum0 = 0;
3319 ab->b_buf->b_efunc = NULL;
3320 ab->b_buf->b_private = NULL;
3321 mutex_exit(hash_lock);
3322 } else {
3323 ASSERT(refcount_is_zero(&ab->b_refcnt));
3324 ab->b_flags |= ARC_FREE_IN_PROGRESS;
3325 mutex_exit(hash_lock);
3326 arc_hdr_destroy(ab);
3327 ARCSTAT_BUMP(arcstat_deleted);
3328 }
3329 }
3330 }
3331
3332 static int
3333 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3334 {
3335 #ifdef _KERNEL
3336 uint64_t available_memory = ptob(freemem);
3337 static uint64_t page_load = 0;
3338 static uint64_t last_txg = 0;
3339
3340 available_memory =
3341 MIN(available_memory, vmem_size(kmem_arena, VMEM_FREE));
3342 if (available_memory >= zfs_write_limit_max)
3343 return (0);
3344
3345 if (txg > last_txg) {
3346 last_txg = txg;
3347 page_load = 0;
3348 }
3349 /*
3350 * If we are in pageout, we know that memory is already tight,
3351 * the arc is already going to be evicting, so we just want to
3352 * continue to let page writes occur as quickly as possible.
3353 */
3354 if (curproc == proc_pageout) {
3355 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3356 return (ERESTART);
3357 /* Note: reserve is inflated, so we deflate */
3358 page_load += reserve / 8;
3359 return (0);
3360 } else if (page_load > 0 && arc_reclaim_needed()) {
3361 /* memory is low, delay before restarting */
3362 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3363 return (EAGAIN);
3364 }
3365 page_load = 0;
3366
3367 if (arc_size > arc_c_min) {
3368 uint64_t evictable_memory =
3369 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3370 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3371 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3372 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3373 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3374 }
3375
3376 if (inflight_data > available_memory / 4) {
3377 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3378 return (ERESTART);
3379 }
3380 #endif
3381 return (0);
3382 }
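/*
 * Example of the pageout throttle above (hypothetical values): while
 * curproc == proc_pageout, each call first checks page_load against a
 * quarter of MAX(ptob(minfree), available_memory) and returns ERESTART
 * once that budget is exceeded; otherwise it charges reserve / 8 (the
 * reserve is inflated, so it is deflated here) and lets the write
 * proceed.  page_load resets on every new txg.
 */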
3383
3384 void
3385 arc_tempreserve_clear(uint64_t reserve)
3386 {
3387 atomic_add_64(&arc_tempreserve, -reserve);
3388 ASSERT((int64_t)arc_tempreserve >= 0);
3389 }
3390
3391 int
3392 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3393 {
3394 int error;
3395 uint64_t anon_size;
3396
3397 #ifdef ZFS_DEBUG
3398 /*
3399 * Once in a while, fail for no reason. Everything should cope.
3400 */
3401 if (spa_get_random(10000) == 0) {
3402 dprintf("forcing random failure\n");
3403 return (ERESTART);
3404 }
3405 #endif
3406 if (reserve > arc_c/4 && !arc_no_grow)
3407 arc_c = MIN(arc_c_max, reserve * 4);
3408 if (reserve > arc_c)
3409 return (ENOMEM);
3410
3411 /*
3412 * Don't count loaned bufs as in flight dirty data to prevent long
3413 * network delays from blocking transactions that are ready to be
3414 * assigned to a txg.
3415 */
3416 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3417
3418 /*
3419 * Writes will, almost always, require additional memory allocations
3420 * in order to compress/encrypt/etc the data. We therefore need to
3421 * make sure that there is sufficient available memory for this.
3422 */
3423 if (error = arc_memory_throttle(reserve, anon_size, txg))
3424 return (error);
3425
3426 /*
3427 * Throttle writes when the amount of dirty data in the cache
3428 * gets too large. We try to keep the cache less than half full
3429 * of dirty blocks so that our sync times don't grow too large.
3430 * Note: if two requests come in concurrently, we might let them
3431 * both succeed, when one of them should fail. Not a huge deal.
3432 */
3433
3434 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3435 anon_size > arc_c / 4) {
3436 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3437 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3438 arc_tempreserve>>10,
3439 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3440 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3441 reserve>>10, arc_c>>10);
3442 return (ERESTART);
3443 }
3444 atomic_add_64(&arc_tempreserve, reserve);
3445 return (0);
3446 }
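/*
 * Worked example of the dirty-data throttle above (hypothetical sizes):
 * with arc_c == 1GB, a reservation is pushed back with ERESTART once
 * reserve + arc_tempreserve + anon_size exceeds 512MB while anon_size
 * alone exceeds 256MB; as long as anonymous data stays below 256MB this
 * particular check never fires (the memory throttle above may still).
 */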
3447
3448 #if defined(__NetBSD__) && defined(_KERNEL)
3449 /* Reclaim hook registered to uvm for reclaiming KVM and memory */
3450 static void
3451 arc_uvm_reclaim_hook(void)
3452 {
3453
3454 if (mutex_tryenter(&arc_reclaim_thr_lock)) {
3455 cv_broadcast(&arc_reclaim_thr_cv);
3456 mutex_exit(&arc_reclaim_thr_lock);
3457 }
3458 }
3459
3460 static int
3461 arc_kva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
3462 {
3463
3464
3465 if (mutex_tryenter(&arc_reclaim_thr_lock)) {
3466 cv_broadcast(&arc_reclaim_thr_cv);
3467 mutex_exit(&arc_reclaim_thr_lock);
3468 }
3469
3470 return CALLBACK_CHAIN_CONTINUE;
3471 }
3472
3473 #endif /* __NetBSD__ */
3474
3475 void
3476 arc_init(void)
3477 {
3478 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3479 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3480
3481 /* Convert seconds to clock ticks */
3482 arc_min_prefetch_lifespan = 1 * hz;
3483
3484 /* Start out with 1/8 of all memory */
3485 arc_c = physmem * PAGESIZE / 8;
3486
3487 #ifdef _KERNEL
3488 /*
3489 * On architectures where the physical memory can be larger
3490 * than the addressable space (intel in 32-bit mode), we may
3491 * need to limit the cache to 1/8 of VM size.
3492 */
3493 arc_c = MIN(arc_c, vmem_size(kmem_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3494 #endif
3495
3496 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3497 arc_c_min = MAX(arc_c / 4, 64<<20);
3498 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3499 if (arc_c * 8 >= 1<<30)
3500 arc_c_max = (arc_c * 8) - (1<<30);
3501 else
3502 arc_c_max = arc_c_min;
3503 arc_c_max = MAX(arc_c * 6, arc_c_max);
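	/*
	 * Worked example, assuming (hypothetically) 8GB of physical memory
	 * and no 32-bit kmem clamp: arc_c starts at 1GB, arc_c_min becomes
	 * MAX(256MB, 64MB) == 256MB, and arc_c_max becomes
	 * MAX(6GB, 8GB - 1GB) == 7GB, before the zfs_arc_* tunables below
	 * are applied.
	 */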
3504
3505 /*
3506 * Allow the tunables to override our calculations if they are
3507 * reasonable (ie. over 64MB)
3508 */
3509 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3510 arc_c_max = zfs_arc_max;
3511 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3512 arc_c_min = zfs_arc_min;
3513
3514 arc_c = arc_c_max;
3515 arc_p = (arc_c >> 1);
3516
3517 /* limit meta-data to 1/4 of the arc capacity */
3518 arc_meta_limit = arc_c_max / 4;
3519
3520 /* Allow the tunable to override if it is reasonable */
3521 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3522 arc_meta_limit = zfs_arc_meta_limit;
3523
3524 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3525 arc_c_min = arc_meta_limit / 2;
3526
3527 if (zfs_arc_grow_retry > 0)
3528 arc_grow_retry = zfs_arc_grow_retry;
3529
3530 if (zfs_arc_shrink_shift > 0)
3531 arc_shrink_shift = zfs_arc_shrink_shift;
3532
3533 if (zfs_arc_p_min_shift > 0)
3534 arc_p_min_shift = zfs_arc_p_min_shift;
3535
3536 /* if kmem_flags are set, let's try to use less memory */
3537 if (kmem_debugging())
3538 arc_c = arc_c / 2;
3539 if (arc_c < arc_c_min)
3540 arc_c = arc_c_min;
3541
3542 arc_anon = &ARC_anon;
3543 arc_mru = &ARC_mru;
3544 arc_mru_ghost = &ARC_mru_ghost;
3545 arc_mfu = &ARC_mfu;
3546 arc_mfu_ghost = &ARC_mfu_ghost;
3547 arc_l2c_only = &ARC_l2c_only;
3548 arc_size = 0;
3549
3550 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3551 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3552 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3553 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3554 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3555 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3556
3557 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3558 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3559 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3560 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3561 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3562 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3563 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3564 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3565 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3566 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3567 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3568 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3569 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3570 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3571 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3572 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3573 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3574 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3575 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3576 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3577
3578 buf_init();
3579
3580 arc_thread_exit = 0;
3581 arc_eviction_list = NULL;
3582 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3583 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3584
3585 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3586 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3587
3588 if (arc_ksp != NULL) {
3589 arc_ksp->ks_data = &arc_stats;
3590 kstat_install(arc_ksp);
3591 }
3592
3593 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3594 TS_RUN, maxclsyspri);
3595
3596 #if defined(__NetBSD__) && defined(_KERNEL)
3597 /* arc_hook.uvm_reclaim_hook = &arc_uvm_reclaim_hook;
3598
3599 uvm_reclaim_hook_add(&arc_hook);
3600 callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
3601 &arc_kva_reclaim_entry, NULL, arc_kva_reclaim_callback); */
3602
3603 #endif
3604
3605 arc_dead = FALSE;
3606 arc_warm = B_FALSE;
3607
3608 if (zfs_write_limit_max == 0)
3609 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3610 else
3611 zfs_write_limit_shift = 0;
3612 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3613 }
3614
3615 void
3616 arc_fini(void)
3617 {
3618 mutex_enter(&arc_reclaim_thr_lock);
3619 arc_thread_exit = 1;
3620 while (arc_thread_exit != 0)
3621 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3622 mutex_exit(&arc_reclaim_thr_lock);
3623
3624 arc_flush(NULL);
3625
3626 arc_dead = TRUE;
3627
3628 if (arc_ksp != NULL) {
3629 kstat_delete(arc_ksp);
3630 arc_ksp = NULL;
3631 }
3632
3633 mutex_destroy(&arc_eviction_mtx);
3634 mutex_destroy(&arc_reclaim_thr_lock);
3635 cv_destroy(&arc_reclaim_thr_cv);
3636
3637 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3638 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3639 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3640 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3641 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3642 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3643 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3644 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3645
3646 mutex_destroy(&arc_anon->arcs_mtx);
3647 mutex_destroy(&arc_mru->arcs_mtx);
3648 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3649 mutex_destroy(&arc_mfu->arcs_mtx);
3650 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3651 mutex_destroy(&arc_l2c_only->arcs_mtx);
3652
3653 mutex_destroy(&zfs_write_limit_lock);
3654
3655 #if defined(__NetBSD__) && defined(_KERNEL)
3656 /* uvm_reclaim_hook_del(&arc_hook);
3657 callback_unregister(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
3658 &arc_kva_reclaim_entry); */
3659 #endif
3660
3661 buf_fini();
3662
3663 ASSERT(arc_loaned_bytes == 0);
3664 }
3665
3666 /*
3667 * Level 2 ARC
3668 *
3669 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3670 * It uses dedicated storage devices to hold cached data, which are populated
3671 * using large infrequent writes. The main role of this cache is to boost
3672 * the performance of random read workloads. The intended L2ARC devices
3673 * include short-stroked disks, solid state disks, and other media with
3674 * substantially faster read latency than disk.
3675 *
3676 * +-----------------------+
3677 * | ARC |
3678 * +-----------------------+
3679 * | ^ ^
3680 * | | |
3681 * l2arc_feed_thread() arc_read()
3682 * | | |
3683 * | l2arc read |
3684 * V | |
3685 * +---------------+ |
3686 * | L2ARC | |
3687 * +---------------+ |
3688 * | ^ |
3689 * l2arc_write() | |
3690 * | | |
3691 * V | |
3692 * +-------+ +-------+
3693 * | vdev | | vdev |
3694 * | cache | | cache |
3695 * +-------+ +-------+
3696 * +=========+ .-----.
3697 * : L2ARC : |-_____-|
3698 * : devices : | Disks |
3699 * +=========+ `-_____-'
3700 *
3701 * Read requests are satisfied from the following sources, in order:
3702 *
3703 * 1) ARC
3704 * 2) vdev cache of L2ARC devices
3705 * 3) L2ARC devices
3706 * 4) vdev cache of disks
3707 * 5) disks
3708 *
3709 * Some L2ARC device types exhibit extremely slow write performance.
3710 * To accommodate this, there are some significant differences between
3711 * the L2ARC and traditional cache design:
3712 *
3713 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
3714 * the ARC behave as usual, freeing buffers and placing headers on ghost
3715 * lists. The ARC does not send buffers to the L2ARC during eviction as
3716 * this would add inflated write latencies for all ARC memory pressure.
3717 *
3718 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3719 * It does this by periodically scanning buffers from the eviction-end of
3720 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3721 * not already there. It scans until a headroom of buffers is satisfied,
3722 * which itself is a buffer for ARC eviction. The thread that does this is
3723 * l2arc_feed_thread(), illustrated below; example sizes are included to
3724 * provide a better sense of ratio than this diagram:
3725 *
3726 * head --> tail
3727 * +---------------------+----------+
3728 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
3729 * +---------------------+----------+ | o L2ARC eligible
3730 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
3731 * +---------------------+----------+ |
3732 * 15.9 Gbytes ^ 32 Mbytes |
3733 * headroom |
3734 * l2arc_feed_thread()
3735 * |
3736 * l2arc write hand <--[oooo]--'
3737 * | 8 Mbyte
3738 * | write max
3739 * V
3740 * +==============================+
3741 * L2ARC dev |####|#|###|###| |####| ... |
3742 * +==============================+
3743 * 32 Gbytes
3744 *
3745 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3746 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3747 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
3748 * safe to say that this is an uncommon case, since buffers at the end of
3749 * the ARC lists have moved there due to inactivity.
3750 *
3751 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3752 * then the L2ARC simply misses copying some buffers. This serves as a
3753 * pressure valve to prevent heavy read workloads from both stalling the ARC
3754 * with waits and clogging the L2ARC with writes. This also helps prevent
3755 * the potential for the L2ARC to churn if it attempts to cache content too
3756 * quickly, such as during backups of the entire pool.
3757 *
3758 * 5. After system boot and before the ARC has filled main memory, there are
3759 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3760 * lists can remain mostly static. Instead of searching from tail of these
3761 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3762 * for eligible buffers, greatly increasing its chance of finding them.
3763 *
3764 * The L2ARC device write speed is also boosted during this time so that
3765 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
3766 * there are no L2ARC reads, and no fear of degrading read performance
3767 * through increased writes.
3768 *
3769 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3770 * the vdev queue can aggregate them into larger and fewer writes. Each
3771 * device is written to in a rotor fashion, sweeping writes through
3772 * available space then repeating.
3773 *
3774 * 7. The L2ARC does not store dirty content. It never needs to flush
3775 * write buffers back to disk based storage.
3776 *
3777 * 8. If an ARC buffer is written (and dirtied) which also exists in the
3778 * L2ARC, the now stale L2ARC buffer is immediately dropped.
3779 *
3780 * The performance of the L2ARC can be tweaked by a number of tunables, which
3781 * may be necessary for different workloads:
3782 *
3783 * l2arc_write_max max write bytes per interval
3784 * l2arc_write_boost extra write bytes during device warmup
3785 * l2arc_noprefetch skip caching prefetched buffers
3786 * l2arc_headroom number of max device writes to precache
3787 * l2arc_feed_secs seconds between L2ARC writing
3788 *
3789 * Tunables may be removed or added as future performance improvements are
3790 * integrated, and also may become zpool properties.
3791 *
3792 * There are three key functions that control how the L2ARC warms up:
3793 *
3794 * l2arc_write_eligible() check if a buffer is eligible to cache
3795 * l2arc_write_size() calculate how much to write
3796 * l2arc_write_interval() calculate sleep delay between writes
3797 *
3798 * These three functions determine what to write, how much, and how quickly
3799 * to send writes.
3800 */
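
/*
 * Editor's sketch (pseudo-code, not part of the original source) of a
 * single feed pass tying the three functions above together; the real
 * loop is in l2arc_feed_thread() below:
 *
 *	dev   = l2arc_dev_get_next();            // rotor to the next device
 *	size  = l2arc_write_size(dev);           // write max (+ boost if cold)
 *	l2arc_evict(dev, size, B_FALSE);         // clear space ahead of the hand
 *	wrote = l2arc_write_buffers(spa, dev, size);
 *	next  = l2arc_write_interval(begin, size, wrote);
 */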
3801
3802 static boolean_t
3803 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3804 {
3805 /*
3806 * A buffer is *not* eligible for the L2ARC if it:
3807 * 1. belongs to a different spa.
3808 * 2. is already cached on the L2ARC.
3809 * 3. has an I/O in progress (it may be an incomplete read).
3810 * 4. is flagged not eligible (zfs property).
3811 */
3812 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3813 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3814 return (B_FALSE);
3815
3816 return (B_TRUE);
3817 }
3818
3819 static uint64_t
3820 l2arc_write_size(l2arc_dev_t *dev)
3821 {
3822 uint64_t size;
3823
3824 size = dev->l2ad_write;
3825
3826 if (arc_warm == B_FALSE)
3827 size += dev->l2ad_boost;
3828
3829 return (size);
3830
3831 }
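
/*
 * Example (editor's illustration, using the 8MB write max shown in the
 * diagram above and assuming l2ad_boost is the same size): a warm ARC
 * allows at most 8MB per feed interval, while a cold ARC (arc_warm ==
 * B_FALSE, i.e. before the first eviction) may write up to 16MB to warm
 * the device faster.
 */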
3832
3833 static clock_t
3834 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
3835 {
3836 clock_t interval, next, now;
3837
3838 /*
3839 * If the ARC lists are busy, increase our write rate; if the
3840 * lists are stale, idle back. This is achieved by checking
3841 * how much we previously wrote - if it was more than half of
3842 * what we wanted, schedule the next write much sooner.
3843 */
3844 if (l2arc_feed_again && wrote > (wanted / 2))
3845 interval = (hz * l2arc_feed_min_ms) / 1000;
3846 else
3847 interval = hz * l2arc_feed_secs;
3848
3849 now = ddi_get_lbolt();
3850 next = MAX(now, MIN(now + interval, began + interval));
3851
3852 return (next);
3853 }
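
/*
 * Example (editor's illustration, assuming hz = 100, l2arc_feed_secs = 1
 * and l2arc_feed_min_ms = 200): a busy pass that wrote more than half of
 * what it wanted is rescheduled 20 ticks (200ms) after it began, while a
 * stale pass waits the full 100 ticks (1s). The MAX(now, ...) clamp keeps
 * the returned wakeup time from falling in the past.
 */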
3854
3855 static void
3856 l2arc_hdr_stat_add(void)
3857 {
3858 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3859 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3860 }
3861
3862 static void
3863 l2arc_hdr_stat_remove(void)
3864 {
3865 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3866 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3867 }
3868
3869 /*
3870 * Cycle through L2ARC devices. This is how L2ARC load balances.
3871 * If a device is returned, this also returns holding the spa config lock.
3872 */
3873 static l2arc_dev_t *
3874 l2arc_dev_get_next(void)
3875 {
3876 l2arc_dev_t *first, *next = NULL;
3877
3878 /*
3879 * Lock out the removal of spas (spa_namespace_lock), then removal
3880 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
3881 * both locks will be dropped and a spa config lock held instead.
3882 */
3883 mutex_enter(&spa_namespace_lock);
3884 mutex_enter(&l2arc_dev_mtx);
3885
3886 /* if there are no vdevs, there is nothing to do */
3887 if (l2arc_ndev == 0)
3888 goto out;
3889
3890 first = NULL;
3891 next = l2arc_dev_last;
3892 do {
3893 /* loop around the list looking for a non-faulted vdev */
3894 if (next == NULL) {
3895 next = list_head(l2arc_dev_list);
3896 } else {
3897 next = list_next(l2arc_dev_list, next);
3898 if (next == NULL)
3899 next = list_head(l2arc_dev_list);
3900 }
3901
3902 /* if we have come back to the start, bail out */
3903 if (first == NULL)
3904 first = next;
3905 else if (next == first)
3906 break;
3907
3908 } while (vdev_is_dead(next->l2ad_vdev));
3909
3910 /* if we were unable to find any usable vdevs, return NULL */
3911 if (vdev_is_dead(next->l2ad_vdev))
3912 next = NULL;
3913
3914 l2arc_dev_last = next;
3915
3916 out:
3917 mutex_exit(&l2arc_dev_mtx);
3918
3919 /*
3920 * Grab the config lock to prevent the 'next' device from being
3921 * removed while we are writing to it.
3922 */
3923 if (next != NULL)
3924 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3925 mutex_exit(&spa_namespace_lock);
3926
3927 return (next);
3928 }
3929
3930 /*
3931 * Free buffers that were tagged for destruction.
3932 */
3933 static void
3934 l2arc_do_free_on_write()
3935 {
3936 list_t *buflist;
3937 l2arc_data_free_t *df, *df_prev;
3938
3939 mutex_enter(&l2arc_free_on_write_mtx);
3940 buflist = l2arc_free_on_write;
3941
3942 for (df = list_tail(buflist); df; df = df_prev) {
3943 df_prev = list_prev(buflist, df);
3944 ASSERT(df->l2df_data != NULL);
3945 ASSERT(df->l2df_func != NULL);
3946 df->l2df_func(df->l2df_data, df->l2df_size);
3947 list_remove(buflist, df);
3948 kmem_free(df, sizeof (l2arc_data_free_t));
3949 }
3950
3951 mutex_exit(&l2arc_free_on_write_mtx);
3952 }
3953
3954 /*
3955 * A write to a cache device has completed. Update all headers to allow
3956 * reads from these buffers to begin.
3957 */
3958 static void
3959 l2arc_write_done(zio_t *zio)
3960 {
3961 l2arc_write_callback_t *cb;
3962 l2arc_dev_t *dev;
3963 list_t *buflist;
3964 arc_buf_hdr_t *head, *ab, *ab_prev;
3965 l2arc_buf_hdr_t *abl2;
3966 kmutex_t *hash_lock;
3967
3968 cb = zio->io_private;
3969 ASSERT(cb != NULL);
3970 dev = cb->l2wcb_dev;
3971 ASSERT(dev != NULL);
3972 head = cb->l2wcb_head;
3973 ASSERT(head != NULL);
3974 buflist = dev->l2ad_buflist;
3975 ASSERT(buflist != NULL);
3976 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
3977 l2arc_write_callback_t *, cb);
3978
3979 if (zio->io_error != 0)
3980 ARCSTAT_BUMP(arcstat_l2_writes_error);
3981
3982 mutex_enter(&l2arc_buflist_mtx);
3983
3984 /*
3985 * All writes completed, or an error was hit.
3986 */
3987 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
3988 ab_prev = list_prev(buflist, ab);
3989
3990 hash_lock = HDR_LOCK(ab);
3991 if (!mutex_tryenter(hash_lock)) {
3992 /*
3993 * This buffer misses out. It may be in a stage
3994 * of eviction. Its ARC_L2_WRITING flag will be
3995 * left set, denying reads to this buffer.
3996 */
3997 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
3998 continue;
3999 }
4000
4001 if (zio->io_error != 0) {
4002 /*
4003 * Error - drop L2ARC entry.
4004 */
4005 list_remove(buflist, ab);
4006 abl2 = ab->b_l2hdr;
4007 ab->b_l2hdr = NULL;
4008 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4009 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4010 }
4011
4012 /*
4013 * Allow ARC to begin reads to this L2ARC entry.
4014 */
4015 ab->b_flags &= ~ARC_L2_WRITING;
4016
4017 mutex_exit(hash_lock);
4018 }
4019
4020 atomic_inc_64(&l2arc_writes_done);
4021 list_remove(buflist, head);
4022 kmem_cache_free(hdr_cache, head);
4023 mutex_exit(&l2arc_buflist_mtx);
4024
4025 l2arc_do_free_on_write();
4026
4027 kmem_free(cb, sizeof (l2arc_write_callback_t));
4028 }
4029
4030 /*
4031 * A read to a cache device completed. Validate buffer contents before
4032 * handing over to the regular ARC routines.
4033 */
4034 static void
4035 l2arc_read_done(zio_t *zio)
4036 {
4037 l2arc_read_callback_t *cb;
4038 arc_buf_hdr_t *hdr;
4039 arc_buf_t *buf;
4040 kmutex_t *hash_lock;
4041 int equal;
4042
4043 ASSERT(zio->io_vd != NULL);
4044 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4045
4046 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4047
4048 cb = zio->io_private;
4049 ASSERT(cb != NULL);
4050 buf = cb->l2rcb_buf;
4051 ASSERT(buf != NULL);
4052 hdr = buf->b_hdr;
4053 ASSERT(hdr != NULL);
4054
4055 hash_lock = HDR_LOCK(hdr);
4056 mutex_enter(hash_lock);
4057
4058 /*
4059 * Check this survived the L2ARC journey.
4060 */
4061 equal = arc_cksum_equal(buf);
4062 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4063 mutex_exit(hash_lock);
4064 zio->io_private = buf;
4065 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4066 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4067 arc_read_done(zio);
4068 } else {
4069 mutex_exit(hash_lock);
4070 /*
4071 * Buffer didn't survive caching. Increment stats and
4072 * reissue to the original storage device.
4073 */
4074 if (zio->io_error != 0) {
4075 ARCSTAT_BUMP(arcstat_l2_io_error);
4076 } else {
4077 zio->io_error = EIO;
4078 }
4079 if (!equal)
4080 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4081
4082 /*
4083 * If there's no waiter, issue an async i/o to the primary
4084 * storage now. If there *is* a waiter, the caller must
4085 * issue the i/o in a context where it's OK to block.
4086 */
4087 if (zio->io_waiter == NULL) {
4088 zio_t *pio = zio_unique_parent(zio);
4089
4090 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4091
4092 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4093 buf->b_data, zio->io_size, arc_read_done, buf,
4094 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4095 }
4096 }
4097
4098 kmem_free(cb, sizeof (l2arc_read_callback_t));
4099 }
4100
4101 /*
4102 * This is the list priority from which the L2ARC will search for pages to
4103 * cache. This is used within loops (0..3) to cycle through lists in the
4104 * desired order. This order can have a significant effect on cache
4105 * performance.
4106 *
4107 * Currently the metadata lists are hit first, MFU then MRU, followed by
4108 * the data lists. This function returns a locked list, and also returns
4109 * the lock pointer.
4110 */
4111 static list_t *
4112 l2arc_list_locked(int list_num, kmutex_t **lock)
4113 {
4114 list_t *list;
4115
4116 ASSERT(list_num >= 0 && list_num <= 3);
4117
4118 switch (list_num) {
4119 case 0:
4120 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4121 *lock = &arc_mfu->arcs_mtx;
4122 break;
4123 case 1:
4124 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4125 *lock = &arc_mru->arcs_mtx;
4126 break;
4127 case 2:
4128 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4129 *lock = &arc_mfu->arcs_mtx;
4130 break;
4131 case 3:
4132 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4133 *lock = &arc_mru->arcs_mtx;
4134 break;
4135 }
4136
4137 ASSERT(!(MUTEX_HELD(*lock)));
4138 mutex_enter(*lock);
4139 return (list);
4140 }
4141
4142 /*
4143 * Evict buffers from the device write hand to the distance specified in
4144 * bytes. This distance may span populated buffers or it may span nothing.
4145 * This is clearing a region on the L2ARC device ready for writing.
4146 * If the 'all' boolean is set, every buffer is evicted.
4147 */
4148 static void
4149 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4150 {
4151 list_t *buflist;
4152 l2arc_buf_hdr_t *abl2;
4153 arc_buf_hdr_t *ab, *ab_prev;
4154 kmutex_t *hash_lock;
4155 uint64_t taddr;
4156
4157 buflist = dev->l2ad_buflist;
4158
4159 if (buflist == NULL)
4160 return;
4161
4162 if (!all && dev->l2ad_first) {
4163 /*
4164 * This is the first sweep through the device. There is
4165 * nothing to evict.
4166 */
4167 return;
4168 }
4169
4170 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4171 /*
4172 * When nearing the end of the device, evict to the end
4173 * before the device write hand jumps to the start.
4174 */
4175 taddr = dev->l2ad_end;
4176 } else {
4177 taddr = dev->l2ad_hand + distance;
4178 }
4179 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4180 uint64_t, taddr, boolean_t, all);
4181
4182 top:
4183 mutex_enter(&l2arc_buflist_mtx);
4184 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4185 ab_prev = list_prev(buflist, ab);
4186
4187 hash_lock = HDR_LOCK(ab);
4188 if (!mutex_tryenter(hash_lock)) {
4189 /*
4190 * Missed the hash lock. Retry.
4191 */
4192 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4193 mutex_exit(&l2arc_buflist_mtx);
4194 mutex_enter(hash_lock);
4195 mutex_exit(hash_lock);
4196 goto top;
4197 }
4198
4199 if (HDR_L2_WRITE_HEAD(ab)) {
4200 /*
4201 * We hit a write head node. Leave it for
4202 * l2arc_write_done().
4203 */
4204 list_remove(buflist, ab);
4205 mutex_exit(hash_lock);
4206 continue;
4207 }
4208
4209 if (!all && ab->b_l2hdr != NULL &&
4210 (ab->b_l2hdr->b_daddr > taddr ||
4211 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4212 /*
4213 * We've evicted to the target address,
4214 * or the end of the device.
4215 */
4216 mutex_exit(hash_lock);
4217 break;
4218 }
4219
4220 if (HDR_FREE_IN_PROGRESS(ab)) {
4221 /*
4222 * Already on the path to destruction.
4223 */
4224 mutex_exit(hash_lock);
4225 continue;
4226 }
4227
4228 if (ab->b_state == arc_l2c_only) {
4229 ASSERT(!HDR_L2_READING(ab));
4230 /*
4231 * This doesn't exist in the ARC. Destroy.
4232 * arc_hdr_destroy() will call list_remove()
4233 * and decrement arcstat_l2_size.
4234 */
4235 arc_change_state(arc_anon, ab, hash_lock);
4236 arc_hdr_destroy(ab);
4237 } else {
4238 /*
4239 * Invalidate issued or about to be issued
4240 * reads, since we may be about to write
4241 * over this location.
4242 */
4243 if (HDR_L2_READING(ab)) {
4244 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4245 ab->b_flags |= ARC_L2_EVICTED;
4246 }
4247
4248 /*
4249 * Tell ARC this no longer exists in L2ARC.
4250 */
4251 if (ab->b_l2hdr != NULL) {
4252 abl2 = ab->b_l2hdr;
4253 ab->b_l2hdr = NULL;
4254 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4255 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4256 }
4257 list_remove(buflist, ab);
4258
4259 /*
4260 * This may have been leftover after a
4261 * failed write.
4262 */
4263 ab->b_flags &= ~ARC_L2_WRITING;
4264 }
4265 mutex_exit(hash_lock);
4266 }
4267 mutex_exit(&l2arc_buflist_mtx);
4268
4269 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4270 dev->l2ad_evict = taddr;
4271 }
4272
4273 /*
4274 * Find and write ARC buffers to the L2ARC device.
4275 *
4276 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4277 * for reading until they have completed writing.
4278 */
4279 static uint64_t
4280 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4281 {
4282 arc_buf_hdr_t *ab, *ab_prev, *head;
4283 l2arc_buf_hdr_t *hdrl2;
4284 list_t *list;
4285 uint64_t passed_sz, write_sz, buf_sz, headroom;
4286 void *buf_data;
4287 kmutex_t *hash_lock, *list_lock;
4288 boolean_t have_lock, full;
4289 l2arc_write_callback_t *cb;
4290 zio_t *pio, *wzio;
4291 uint64_t guid = spa_guid(spa);
4292
4293 ASSERT(dev->l2ad_vdev != NULL);
4294
4295 pio = NULL;
4296 write_sz = 0;
4297 full = B_FALSE;
4298 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4299 head->b_flags |= ARC_L2_WRITE_HEAD;
4300
4301 /*
4302 * Copy buffers for L2ARC writing.
4303 */
4304 mutex_enter(&l2arc_buflist_mtx);
4305 for (int try = 0; try <= 3; try++) {
4306 list = l2arc_list_locked(try, &list_lock);
4307 passed_sz = 0;
4308
4309 /*
4310 * L2ARC fast warmup.
4311 *
4312 * Until the ARC is warm and starts to evict, read from the
4313 * head of the ARC lists rather than the tail.
4314 */
4315 headroom = target_sz * l2arc_headroom;
4316 if (arc_warm == B_FALSE)
4317 ab = list_head(list);
4318 else
4319 ab = list_tail(list);
4320
4321 for (; ab; ab = ab_prev) {
4322 if (arc_warm == B_FALSE)
4323 ab_prev = list_next(list, ab);
4324 else
4325 ab_prev = list_prev(list, ab);
4326
4327 hash_lock = HDR_LOCK(ab);
4328 have_lock = MUTEX_HELD(hash_lock);
4329 if (!have_lock && !mutex_tryenter(hash_lock)) {
4330 /*
4331 * Skip this buffer rather than waiting.
4332 */
4333 continue;
4334 }
4335
4336 passed_sz += ab->b_size;
4337 if (passed_sz > headroom) {
4338 /*
4339 * Searched too far.
4340 */
4341 mutex_exit(hash_lock);
4342 break;
4343 }
4344
4345 if (!l2arc_write_eligible(guid, ab)) {
4346 mutex_exit(hash_lock);
4347 continue;
4348 }
4349
4350 if ((write_sz + ab->b_size) > target_sz) {
4351 full = B_TRUE;
4352 mutex_exit(hash_lock);
4353 break;
4354 }
4355
4356 if (pio == NULL) {
4357 /*
4358 * Insert a dummy header on the buflist so
4359 * l2arc_write_done() can find where the
4360 * write buffers begin without searching.
4361 */
4362 list_insert_head(dev->l2ad_buflist, head);
4363
4364 cb = kmem_alloc(
4365 sizeof (l2arc_write_callback_t), KM_SLEEP);
4366 cb->l2wcb_dev = dev;
4367 cb->l2wcb_head = head;
4368 pio = zio_root(spa, l2arc_write_done, cb,
4369 ZIO_FLAG_CANFAIL);
4370 }
4371
4372 /*
4373 * Create and add a new L2ARC header.
4374 */
4375 hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4376 hdrl2->b_dev = dev;
4377 hdrl2->b_daddr = dev->l2ad_hand;
4378
4379 ab->b_flags |= ARC_L2_WRITING;
4380 ab->b_l2hdr = hdrl2;
4381 list_insert_head(dev->l2ad_buflist, ab);
4382 buf_data = ab->b_buf->b_data;
4383 buf_sz = ab->b_size;
4384
4385 /*
4386 * Compute and store the buffer cksum before
4387 * writing. On debug the cksum is verified first.
4388 */
4389 arc_cksum_verify(ab->b_buf);
4390 arc_cksum_compute(ab->b_buf, B_TRUE);
4391
4392 mutex_exit(hash_lock);
4393
4394 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4395 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4396 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4397 ZIO_FLAG_CANFAIL, B_FALSE);
4398
4399 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4400 zio_t *, wzio);
4401 (void) zio_nowait(wzio);
4402
4403 /*
4404 * Keep the clock hand suitably device-aligned.
4405 */
4406 buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4407
4408 write_sz += buf_sz;
4409 dev->l2ad_hand += buf_sz;
4410 }
4411
4412 mutex_exit(list_lock);
4413
4414 if (full == B_TRUE)
4415 break;
4416 }
4417 mutex_exit(&l2arc_buflist_mtx);
4418
4419 if (pio == NULL) {
4420 ASSERT3U(write_sz, ==, 0);
4421 kmem_cache_free(hdr_cache, head);
4422 return (0);
4423 }
4424
4425 ASSERT3U(write_sz, <=, target_sz);
4426 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4427 ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4428 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4429 vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4430
4431 /*
4432 * Bump device hand to the device start if it is approaching the end.
4433 * l2arc_evict() will already have evicted ahead for this case.
4434 */
4435 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4436 vdev_space_update(dev->l2ad_vdev,
4437 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4438 dev->l2ad_hand = dev->l2ad_start;
4439 dev->l2ad_evict = dev->l2ad_start;
4440 dev->l2ad_first = B_FALSE;
4441 }
4442
4443 dev->l2ad_writing = B_TRUE;
4444 (void) zio_wait(pio);
4445 dev->l2ad_writing = B_FALSE;
4446
4447 return (write_sz);
4448 }
4449
4450 /*
4451 * This thread feeds the L2ARC at regular intervals. This is the beating
4452 * heart of the L2ARC.
4453 */
4454 static void
4455 l2arc_feed_thread(void *unused __unused)
4456 {
4457 callb_cpr_t cpr;
4458 l2arc_dev_t *dev;
4459 spa_t *spa;
4460 uint64_t size, wrote;
4461 clock_t begin, next = ddi_get_lbolt();
4462
4463 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4464
4465 mutex_enter(&l2arc_feed_thr_lock);
4466
4467 while (l2arc_thread_exit == 0) {
4468 CALLB_CPR_SAFE_BEGIN(&cpr);
4469 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4470 (hz * l2arc_feed_secs));
4471 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4472 next = ddi_get_lbolt();
4473
4474 /*
4475 * Quick check for L2ARC devices.
4476 */
4477 mutex_enter(&l2arc_dev_mtx);
4478 if (l2arc_ndev == 0) {
4479 mutex_exit(&l2arc_dev_mtx);
4480 continue;
4481 }
4482 mutex_exit(&l2arc_dev_mtx);
4483 begin = ddi_get_lbolt();
4484
4485 /*
4486 * This selects the next l2arc device to write to, and in
4487 * doing so the next spa to feed from: dev->l2ad_spa. This
4488 * will return NULL if there are now no l2arc devices or if
4489 * they are all faulted.
4490 *
4491 * If a device is returned, its spa's config lock is also
4492 * held to prevent device removal. l2arc_dev_get_next()
4493 * will grab and release l2arc_dev_mtx.
4494 */
4495 if ((dev = l2arc_dev_get_next()) == NULL)
4496 continue;
4497
4498 spa = dev->l2ad_spa;
4499 ASSERT(spa != NULL);
4500
4501 /*
4502 * Avoid contributing to memory pressure.
4503 */
4504 if (arc_reclaim_needed()) {
4505 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4506 spa_config_exit(spa, SCL_L2ARC, dev);
4507 continue;
4508 }
4509
4510 ARCSTAT_BUMP(arcstat_l2_feeds);
4511
4512 size = l2arc_write_size(dev);
4513
4514 /*
4515 * Evict L2ARC buffers that will be overwritten.
4516 */
4517 l2arc_evict(dev, size, B_FALSE);
4518
4519 /*
4520 * Write ARC buffers.
4521 */
4522 wrote = l2arc_write_buffers(spa, dev, size);
4523
4524 /*
4525 * Calculate interval between writes.
4526 */
4527 next = l2arc_write_interval(begin, size, wrote);
4528 spa_config_exit(spa, SCL_L2ARC, dev);
4529 }
4530
4531 l2arc_thread_exit = 0;
4532 cv_broadcast(&l2arc_feed_thr_cv);
4533 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
4534 thread_exit();
4535 }
4536
4537 boolean_t
4538 l2arc_vdev_present(vdev_t *vd)
4539 {
4540 l2arc_dev_t *dev;
4541
4542 mutex_enter(&l2arc_dev_mtx);
4543 for (dev = list_head(l2arc_dev_list); dev != NULL;
4544 dev = list_next(l2arc_dev_list, dev)) {
4545 if (dev->l2ad_vdev == vd)
4546 break;
4547 }
4548 mutex_exit(&l2arc_dev_mtx);
4549
4550 return (dev != NULL);
4551 }
4552
4553 /*
4554 * Add a vdev for use by the L2ARC. By this point the spa has already
4555 * validated the vdev and opened it.
4556 */
4557 void
4558 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4559 {
4560 l2arc_dev_t *adddev;
4561
4562 ASSERT(!l2arc_vdev_present(vd));
4563
4564 /*
4565 * Create a new l2arc device entry.
4566 */
4567 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4568 adddev->l2ad_spa = spa;
4569 adddev->l2ad_vdev = vd;
4570 adddev->l2ad_write = l2arc_write_max;
4571 adddev->l2ad_boost = l2arc_write_boost;
4572 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4573 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4574 adddev->l2ad_hand = adddev->l2ad_start;
4575 adddev->l2ad_evict = adddev->l2ad_start;
4576 adddev->l2ad_first = B_TRUE;
4577 adddev->l2ad_writing = B_FALSE;
4578 ASSERT3U(adddev->l2ad_write, >, 0);
4579
4580 /*
4581 * This is a list of all ARC buffers that are still valid on the
4582 * device.
4583 */
4584 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4585 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4586 offsetof(arc_buf_hdr_t, b_l2node));
4587
4588 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4589
4590 /*
4591 * Add device to global list
4592 */
4593 mutex_enter(&l2arc_dev_mtx);
4594 list_insert_head(l2arc_dev_list, adddev);
4595 atomic_inc_64(&l2arc_ndev);
4596 mutex_exit(&l2arc_dev_mtx);
4597 }
4598
4599 /*
4600 * Remove a vdev from the L2ARC.
4601 */
4602 void
4603 l2arc_remove_vdev(vdev_t *vd)
4604 {
4605 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4606
4607 /*
4608 * Find the device by vdev
4609 */
4610 mutex_enter(&l2arc_dev_mtx);
4611 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4612 nextdev = list_next(l2arc_dev_list, dev);
4613 if (vd == dev->l2ad_vdev) {
4614 remdev = dev;
4615 break;
4616 }
4617 }
4618 ASSERT(remdev != NULL);
4619
4620 /*
4621 * Remove device from global list
4622 */
4623 list_remove(l2arc_dev_list, remdev);
4624 l2arc_dev_last = NULL; /* may have been invalidated */
4625 atomic_dec_64(&l2arc_ndev);
4626 mutex_exit(&l2arc_dev_mtx);
4627
4628 /*
4629 * Clear all buflists and ARC references. L2ARC device flush.
4630 */
4631 l2arc_evict(remdev, 0, B_TRUE);
4632 list_destroy(remdev->l2ad_buflist);
4633 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4634 kmem_free(remdev, sizeof (l2arc_dev_t));
4635 }
4636
4637 void
4638 l2arc_init(void)
4639 {
4640 l2arc_thread_exit = 0;
4641 l2arc_ndev = 0;
4642 l2arc_writes_sent = 0;
4643 l2arc_writes_done = 0;
4644
4645 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4646 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4647 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4648 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4649 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4650
4651 l2arc_dev_list = &L2ARC_dev_list;
4652 l2arc_free_on_write = &L2ARC_free_on_write;
4653 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4654 offsetof(l2arc_dev_t, l2ad_node));
4655 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4656 offsetof(l2arc_data_free_t, l2df_list_node));
4657 }
4658
4659 void
4660 l2arc_fini(void)
4661 {
4662 /*
4663 * This is called from dmu_fini(), which is called from spa_fini();
4664 * Because of this, we can assume that all l2arc devices have
4665 * already been removed when the pools themselves were removed.
4666 */
4667
4668 l2arc_do_free_on_write();
4669
4670 mutex_destroy(&l2arc_feed_thr_lock);
4671 cv_destroy(&l2arc_feed_thr_cv);
4672 mutex_destroy(&l2arc_dev_mtx);
4673 mutex_destroy(&l2arc_buflist_mtx);
4674 mutex_destroy(&l2arc_free_on_write_mtx);
4675
4676 list_destroy(l2arc_dev_list);
4677 list_destroy(l2arc_free_on_write);
4678 }
4679
4680 void
4681 l2arc_start(void)
4682 {
4683 if (!(spa_mode_global & FWRITE))
4684 return;
4685
4686 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4687 TS_RUN, minclsyspri);
4688 }
4689
4690 void
4691 l2arc_stop(void)
4692 {
4693 if (!(spa_mode_global & FWRITE))
4694 return;
4695
4696 mutex_enter(&l2arc_feed_thr_lock);
4697 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
4698 l2arc_thread_exit = 1;
4699 while (l2arc_thread_exit != 0)
4700 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4701 mutex_exit(&l2arc_feed_thr_lock);
4702 }
4703