1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
28  */
29 
30 #include <sys/zfs_context.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dmu_zfetch.h>
34 #include <sys/dmu.h>
35 #include <sys/dbuf.h>
36 #include <sys/kstat.h>
37 #include <sys/wmsum.h>
38 
39 /*
40  * This tunable disables predictive prefetch.  Note that it leaves "prescient"
41  * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
42  * prescient prefetch never issues i/os that end up not being needed,
43  * so it can't hurt performance.
44  */
45 
46 static int zfs_prefetch_disable = B_FALSE;
47 
48 /* max # of streams per zfetch */
49 static unsigned int	zfetch_max_streams = 8;
50 /* min time before stream reclaim */
51 static unsigned int	zfetch_min_sec_reap = 1;
52 /* max time before stream delete */
53 static unsigned int	zfetch_max_sec_reap = 2;
54 /* min bytes to prefetch per stream (default 4MB) */
55 static unsigned int	zfetch_min_distance = 4 * 1024 * 1024;
56 /* max bytes to prefetch per stream (default 64MB) */
57 unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
58 /* max bytes to prefetch indirects for per stream (default 64MB) */
59 unsigned int	zfetch_max_idistance = 64 * 1024 * 1024;
60 /* max number of bytes in an array_read in which we allow prefetching (1MB) */
61 uint64_t	zfetch_array_rd_sz = 1024 * 1024;
62 
63 typedef struct zfetch_stats {
64 	kstat_named_t zfetchstat_hits;
65 	kstat_named_t zfetchstat_misses;
66 	kstat_named_t zfetchstat_max_streams;
67 	kstat_named_t zfetchstat_io_issued;
68 } zfetch_stats_t;
69 
70 static zfetch_stats_t zfetch_stats = {
71 	{ "hits",			KSTAT_DATA_UINT64 },
72 	{ "misses",			KSTAT_DATA_UINT64 },
73 	{ "max_streams",		KSTAT_DATA_UINT64 },
74 	{ "io_issued",		KSTAT_DATA_UINT64 },
75 };
76 
77 struct {
78 	wmsum_t zfetchstat_hits;
79 	wmsum_t zfetchstat_misses;
80 	wmsum_t zfetchstat_max_streams;
81 	wmsum_t zfetchstat_io_issued;
82 } zfetch_sums;
83 
84 #define	ZFETCHSTAT_BUMP(stat)					\
85 	wmsum_add(&zfetch_sums.stat, 1)
86 #define	ZFETCHSTAT_ADD(stat, val)				\
87 	wmsum_add(&zfetch_sums.stat, val)
88 
89 
90 static kstat_t		*zfetch_ksp;
91 
92 static int
93 zfetch_kstats_update(kstat_t *ksp, int rw)
94 {
95 	zfetch_stats_t *zs = ksp->ks_data;
96 
97 	if (rw == KSTAT_WRITE)
98 		return (EACCES);
99 	zs->zfetchstat_hits.value.ui64 =
100 	    wmsum_value(&zfetch_sums.zfetchstat_hits);
101 	zs->zfetchstat_misses.value.ui64 =
102 	    wmsum_value(&zfetch_sums.zfetchstat_misses);
103 	zs->zfetchstat_max_streams.value.ui64 =
104 	    wmsum_value(&zfetch_sums.zfetchstat_max_streams);
105 	zs->zfetchstat_io_issued.value.ui64 =
106 	    wmsum_value(&zfetch_sums.zfetchstat_io_issued);
107 	return (0);
108 }
109 
110 void
111 zfetch_init(void)
112 {
113 	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
114 	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
115 	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
116 	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
117 
118 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
119 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
120 	    KSTAT_FLAG_VIRTUAL);
121 
122 	if (zfetch_ksp != NULL) {
123 		zfetch_ksp->ks_data = &zfetch_stats;
124 		zfetch_ksp->ks_update = zfetch_kstats_update;
125 		kstat_install(zfetch_ksp);
126 	}
127 }
128 
129 void
130 zfetch_fini(void)
131 {
132 	if (zfetch_ksp != NULL) {
133 		kstat_delete(zfetch_ksp);
134 		zfetch_ksp = NULL;
135 	}
136 
137 	wmsum_fini(&zfetch_sums.zfetchstat_hits);
138 	wmsum_fini(&zfetch_sums.zfetchstat_misses);
139 	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
140 	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
141 }
142 
143 /*
144  * This takes a pointer to a zfetch structure and a dnode.  It performs the
145  * necessary setup for the zfetch structure, grokking data from the
146  * associated dnode.
147  */
148 void
149 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
150 {
151 	if (zf == NULL)
152 		return;
153 	zf->zf_dnode = dno;
154 	zf->zf_numstreams = 0;
155 
156 	list_create(&zf->zf_stream, sizeof (zstream_t),
157 	    offsetof(zstream_t, zs_node));
158 
159 	mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
160 }
161 
162 static void
163 dmu_zfetch_stream_fini(zstream_t *zs)
164 {
165 	ASSERT(!list_link_active(&zs->zs_node));
166 	zfs_refcount_destroy(&zs->zs_callers);
167 	zfs_refcount_destroy(&zs->zs_refs);
168 	kmem_free(zs, sizeof (*zs));
169 }
170 
171 static void
172 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
173 {
174 	ASSERT(MUTEX_HELD(&zf->zf_lock));
175 	list_remove(&zf->zf_stream, zs);
176 	zf->zf_numstreams--;
177 	membar_producer();
178 	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
179 		dmu_zfetch_stream_fini(zs);
180 }
181 
182 /*
183  * Clean-up state associated with a zfetch structure (e.g. destroy the
184  * streams).  This doesn't free the zfetch_t itself, that's left to the caller.
185  */
186 void
187 dmu_zfetch_fini(zfetch_t *zf)
188 {
189 	zstream_t *zs;
190 
191 	mutex_enter(&zf->zf_lock);
192 	while ((zs = list_head(&zf->zf_stream)) != NULL)
193 		dmu_zfetch_stream_remove(zf, zs);
194 	mutex_exit(&zf->zf_lock);
195 	list_destroy(&zf->zf_stream);
196 	mutex_destroy(&zf->zf_lock);
197 
198 	zf->zf_dnode = NULL;
199 }
200 
201 /*
202  * If there aren't too many active streams already, create one more.
203  * In process delete/reuse all streams without hits for zfetch_max_sec_reap.
204  * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever.
205  * The "blkid" argument is the next block that we expect this stream to access.
206  */
207 static void
208 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
209 {
210 	zstream_t *zs, *zs_next, *zs_old = NULL;
211 	hrtime_t now = gethrtime(), t;
212 
213 	ASSERT(MUTEX_HELD(&zf->zf_lock));
214 
215 	/*
216 	 * Delete too old streams, reusing the first found one.
217 	 */
218 	t = now - SEC2NSEC(zfetch_max_sec_reap);
219 	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
220 		zs_next = list_next(&zf->zf_stream, zs);
221 		/*
222 		 * Skip if still active.  1 -- zf_stream reference.
223 		 */
224 		if (zfs_refcount_count(&zs->zs_refs) != 1)
225 			continue;
226 		if (zs->zs_atime > t)
227 			continue;
228 		if (zs_old)
229 			dmu_zfetch_stream_remove(zf, zs);
230 		else
231 			zs_old = zs;
232 	}
233 	if (zs_old) {
234 		zs = zs_old;
235 		goto reuse;
236 	}
237 
238 	/*
239 	 * The maximum number of streams is normally zfetch_max_streams,
240 	 * but for small files we lower it such that it's at least possible
241 	 * for all the streams to be non-overlapping.
242 	 */
243 	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
244 	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
245 	    zfetch_max_distance));
246 	if (zf->zf_numstreams >= max_streams) {
247 		t = now - SEC2NSEC(zfetch_min_sec_reap);
248 		for (zs = list_head(&zf->zf_stream); zs != NULL;
249 		    zs = list_next(&zf->zf_stream, zs)) {
250 			if (zfs_refcount_count(&zs->zs_refs) != 1)
251 				continue;
252 			if (zs->zs_atime > t)
253 				continue;
254 			if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
255 				zs_old = zs;
256 		}
257 		if (zs_old) {
258 			zs = zs_old;
259 			goto reuse;
260 		}
261 		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
262 		return;
263 	}
264 
265 	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
266 	zs->zs_fetch = zf;
267 	zfs_refcount_create(&zs->zs_callers);
268 	zfs_refcount_create(&zs->zs_refs);
269 	/* One reference for zf_stream. */
270 	zfs_refcount_add(&zs->zs_refs, NULL);
271 	zf->zf_numstreams++;
272 	list_insert_head(&zf->zf_stream, zs);
273 
274 reuse:
275 	zs->zs_blkid = blkid;
276 	zs->zs_pf_dist = 0;
277 	zs->zs_pf_start = blkid;
278 	zs->zs_pf_end = blkid;
279 	zs->zs_ipf_dist = 0;
280 	zs->zs_ipf_start = blkid;
281 	zs->zs_ipf_end = blkid;
282 	/* Allow immediate stream reuse until first hit. */
283 	zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
284 	zs->zs_missed = B_FALSE;
285 	zs->zs_more = B_FALSE;
286 }
287 
288 static void
289 dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
290 {
291 	zstream_t *zs = arg;
292 
293 	if (io_issued && level == 0 && blkid < zs->zs_blkid)
294 		zs->zs_more = B_TRUE;
295 	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
296 		dmu_zfetch_stream_fini(zs);
297 }
298 
299 /*
300  * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
301  * associates dnode access specified with blkid and nblks arguments with
302  * prefetch stream, predicts further accesses based on that stats and returns
303  * the stream pointer on success.  That pointer must later be passed to
304  * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
305  * release it.  dmu_zfetch() is a wrapper for simple cases when window between
306  * prediction and prefetch initiation is not needed.
307  * fetch_data argument specifies whether actual data blocks should be fetched:
308  *   FALSE -- prefetch only indirect blocks for predicted data blocks;
309  *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
310  */
311 zstream_t *
312 dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
313     boolean_t fetch_data, boolean_t have_lock)
314 {
315 	zstream_t *zs;
316 	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
317 
318 	if (zfs_prefetch_disable)
319 		return (NULL);
320 	/*
321 	 * If we haven't yet loaded the indirect vdevs' mappings, we
322 	 * can only read from blocks that we carefully ensure are on
323 	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
324 	 * can't allow the predictive prefetcher to attempt reads of other
325 	 * blocks (e.g. of the MOS's dnode object).
326 	 */
327 	if (!spa_indirect_vdevs_loaded(spa))
328 		return (NULL);
329 
330 	/*
331 	 * As a fast path for small (single-block) files, ignore access
332 	 * to the first block.
333 	 */
334 	if (!have_lock && blkid == 0)
335 		return (NULL);
336 
337 	if (!have_lock)
338 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
339 
340 	/*
341 	 * A fast path for small files for which no prefetch will
342 	 * happen.
343 	 */
344 	uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
345 	if (maxblkid < 2) {
346 		if (!have_lock)
347 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
348 		return (NULL);
349 	}
350 	mutex_enter(&zf->zf_lock);
351 
352 	/*
353 	 * Find matching prefetch stream.  Depending on whether the accesses
354 	 * are block-aligned, first block of the new access may either follow
355 	 * the last block of the previous access, or be equal to it.
356 	 */
357 	for (zs = list_head(&zf->zf_stream); zs != NULL;
358 	    zs = list_next(&zf->zf_stream, zs)) {
359 		if (blkid == zs->zs_blkid) {
360 			break;
361 		} else if (blkid + 1 == zs->zs_blkid) {
362 			blkid++;
363 			nblks--;
364 			break;
365 		}
366 	}
367 
368 	/*
369 	 * If the file is ending, remove the matching stream if found.
370 	 * If not found then it is too late to create a new one now.
371 	 */
372 	uint64_t end_of_access_blkid = blkid + nblks;
373 	if (end_of_access_blkid >= maxblkid) {
374 		if (zs != NULL)
375 			dmu_zfetch_stream_remove(zf, zs);
376 		mutex_exit(&zf->zf_lock);
377 		if (!have_lock)
378 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
379 		return (NULL);
380 	}
381 
382 	/* Exit if we already prefetched this block before. */
383 	if (nblks == 0) {
384 		mutex_exit(&zf->zf_lock);
385 		if (!have_lock)
386 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
387 		return (NULL);
388 	}
389 
390 	if (zs == NULL) {
391 		/*
392 		 * This access is not part of any existing stream.  Create
393 		 * a new stream for it.
394 		 */
395 		dmu_zfetch_stream_create(zf, end_of_access_blkid);
396 		mutex_exit(&zf->zf_lock);
397 		if (!have_lock)
398 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
399 		ZFETCHSTAT_BUMP(zfetchstat_misses);
400 		return (NULL);
401 	}
402 
403 	/*
404 	 * This access was to a block that we issued a prefetch for on
405 	 * behalf of this stream.  Calculate further prefetch distances.
406 	 *
407 	 * Start prefetch from the demand access size (nblks).  Double the
408 	 * distance every access up to zfetch_min_distance.  After that only
409 	 * if needed increase the distance by 1/8 up to zfetch_max_distance.
410 	 */
411 	unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift;
412 	unsigned int pf_nblks;
413 	if (fetch_data) {
414 		if (unlikely(zs->zs_pf_dist < nbytes))
415 			zs->zs_pf_dist = nbytes;
416 		else if (zs->zs_pf_dist < zfetch_min_distance)
417 			zs->zs_pf_dist *= 2;
418 		else if (zs->zs_more)
419 			zs->zs_pf_dist += zs->zs_pf_dist / 8;
420 		zs->zs_more = B_FALSE;
421 		if (zs->zs_pf_dist > zfetch_max_distance)
422 			zs->zs_pf_dist = zfetch_max_distance;
423 		pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift;
424 	} else {
425 		pf_nblks = 0;
426 	}
427 	if (zs->zs_pf_start < end_of_access_blkid)
428 		zs->zs_pf_start = end_of_access_blkid;
429 	if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
430 		zs->zs_pf_end = end_of_access_blkid + pf_nblks;
431 
432 	/*
433 	 * Do the same for indirects, starting where we will stop reading
434 	 * data blocks (and the indirects that point to them).
435 	 */
436 	if (unlikely(zs->zs_ipf_dist < nbytes))
437 		zs->zs_ipf_dist = nbytes;
438 	else
439 		zs->zs_ipf_dist *= 2;
440 	if (zs->zs_ipf_dist > zfetch_max_idistance)
441 		zs->zs_ipf_dist = zfetch_max_idistance;
442 	pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift;
443 	if (zs->zs_ipf_start < zs->zs_pf_end)
444 		zs->zs_ipf_start = zs->zs_pf_end;
445 	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
446 		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;
447 
448 	zs->zs_blkid = end_of_access_blkid;
449 	/* Protect the stream from reclamation. */
450 	zs->zs_atime = gethrtime();
451 	zfs_refcount_add(&zs->zs_refs, NULL);
452 	/* Count concurrent callers. */
453 	zfs_refcount_add(&zs->zs_callers, NULL);
454 	mutex_exit(&zf->zf_lock);
455 
456 	if (!have_lock)
457 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
458 
459 	ZFETCHSTAT_BUMP(zfetchstat_hits);
460 	return (zs);
461 }
462 
463 void
464 dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
465 {
466 	zfetch_t *zf = zs->zs_fetch;
467 	int64_t pf_start, pf_end, ipf_start, ipf_end;
468 	int epbs, issued;
469 
470 	if (missed)
471 		zs->zs_missed = missed;
472 
473 	/*
474 	 * Postpone the prefetch if there are more concurrent callers.
475 	 * It happens when multiple requests are waiting for the same
476 	 * indirect block.  The last one will run the prefetch for all.
477 	 */
478 	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
479 		/* Drop reference taken in dmu_zfetch_prepare(). */
480 		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
481 			dmu_zfetch_stream_fini(zs);
482 		return;
483 	}
484 
485 	mutex_enter(&zf->zf_lock);
486 	if (zs->zs_missed) {
487 		pf_start = zs->zs_pf_start;
488 		pf_end = zs->zs_pf_start = zs->zs_pf_end;
489 	} else {
490 		pf_start = pf_end = 0;
491 	}
492 	ipf_start = zs->zs_ipf_start;
493 	ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
494 	mutex_exit(&zf->zf_lock);
495 	ASSERT3S(pf_start, <=, pf_end);
496 	ASSERT3S(ipf_start, <=, ipf_end);
497 
498 	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
499 	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
500 	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
501 	ASSERT3S(ipf_start, <=, ipf_end);
502 	issued = pf_end - pf_start + ipf_end - ipf_start;
503 	if (issued > 1) {
504 		/* More references on top of taken in dmu_zfetch_prepare(). */
505 		for (int i = 0; i < issued - 1; i++)
506 			zfs_refcount_add(&zs->zs_refs, NULL);
507 	} else if (issued == 0) {
508 		/* Some other thread has done our work, so drop the ref. */
509 		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
510 			dmu_zfetch_stream_fini(zs);
511 		return;
512 	}
513 
514 	if (!have_lock)
515 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
516 
517 	issued = 0;
518 	for (int64_t blk = pf_start; blk < pf_end; blk++) {
519 		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
520 		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
521 	}
522 	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
523 		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
524 		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
525 	}
526 
527 	if (!have_lock)
528 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
529 
530 	if (issued)
531 		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
532 }
533 
534 void
535 dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
536     boolean_t missed, boolean_t have_lock)
537 {
538 	zstream_t *zs;
539 
540 	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
541 	if (zs)
542 		dmu_zfetch_run(zs, missed, have_lock);
543 }
544 
545 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
546 	"Disable all ZFS prefetching");
547 
548 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
549 	"Max number of streams per zfetch");
550 
551 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
552 	"Min time before stream reclaim");
553 
554 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
555 	"Max time before stream delete");
556 
557 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
558 	"Min bytes to prefetch per stream");
559 
560 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
561 	"Max bytes to prefetch per stream");
562 
563 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
564 	"Max bytes to prefetch indirects for per stream");
565 
566 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW,
567 	"Number of bytes in a array_read");
568