/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * See abd.c for a general overview of the ARC buffered data (ABD).
 *
 * Using a large proportion of scattered ABDs decreases ARC fragmentation:
 * when we are at the limit of allocatable space, equal-size chunks allow us
 * to quickly reclaim enough space for a new large allocation (assuming it is
 * also scattered).
 *
 * ABDs are allocated scattered by default unless the caller uses
 * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",				KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",			KSTAT_DATA_UINT64 },
};

struct {
	wmsum_t abdstat_struct_size;
	wmsum_t abdstat_scatter_cnt;
	wmsum_t abdstat_scatter_data_size;
	wmsum_t abdstat_scatter_chunk_waste;
	wmsum_t abdstat_linear_cnt;
	wmsum_t abdstat_linear_data_size;
} abd_sums;

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABDs for.  Smaller allocations will use linear ABDs, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABDs use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. a 2KB scatter allocation wastes
 * half of each page).  Using linear ABDs for small allocations means that
 * they will be put on slabs which contain many allocations.
 *
 * Linear ABDs for multi-page allocations are easier to use, and in some
 * cases they avoid buffer copying.  But allocating and especially freeing
 * multi-page linear ABDs are expensive operations due to KVA mapping and
 * unmapping, and over time they cause KVA fragmentation.
 */
static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;
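/*
 * For example, with 4KB pages the default threshold is PAGE_SIZE + 1 =
 * 4097 bytes: a single-page (4KB) allocation is served as a linear ABD,
 * while any larger allocation is scattered (when scatter is enabled).
 */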

#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);

SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
	&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
	&zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
#endif

kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;

/*
 * abd_zero_scatter is a scattered, SPA_MAXBLOCKSIZE-sized ABD whose chunks
 * all point to a single zero'd page-sized buffer.  This conserves memory by
 * using only one zero buffer for all of the scatter chunks.
 */
abd_t *abd_zero_scatter = NULL;

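/*
 * Return the number of page-sized chunks needed to hold size bytes,
 * rounding up.  For example, with 4KB pages, a 6000-byte request needs
 * (6000 + 4095) >> 12 = 2 chunks.
 */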
static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return ((size + PAGE_MASK) >> PAGE_SHIFT);
}

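/*
 * Return the number of chunks backing a scatter ABD.  The abd_offset
 * bytes at the front of the first chunk precede the ABD's data, so they
 * are counted when computing how many pages the data spans.
 */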
static inline uint_t
abd_scatter_chunkcnt(abd_t *abd)
{
	ASSERT(!abd_is_linear(abd));
	return (abd_chunkcnt_for_bytes(
	    ABD_SCATTER(abd).abd_offset + abd->abd_size));
}

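/*
 * Return B_TRUE if an allocation of the given size should be linear,
 * either because scatter ABDs are disabled or because the size is below
 * zfs_abd_scatter_min_size.
 */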
boolean_t
abd_size_alloc_linear(size_t size)
{
	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
}

void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	uint_t n = abd_scatter_chunkcnt(abd);
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
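	/*
	 * waste counts bytes allocated beyond abd_size; e.g. with 4KB
	 * pages, a 6000-byte ABD occupies two chunks (8192 bytes) and
	 * wastes 2192 bytes at the end of the last chunk.
	 */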
	int waste = (n << PAGE_SHIFT) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

void
abd_verify_scatter(abd_t *abd)
{
	uint_t i, n;

	/*
	 * There are no scatter linear pages in FreeBSD, so it is an
	 * error if the ABD has been marked as a linear page.
	 */
	ASSERT(!abd_is_linear_page(abd));
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <, PAGE_SIZE);
	n = abd_scatter_chunkcnt(abd);
	for (i = 0; i < n; i++) {
		ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
	}
}

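/*
 * Allocate the page-sized chunks that back a scatter ABD of the given
 * size from the abd_chunk_cache.
 */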
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	uint_t i, n;

	n = abd_chunkcnt_for_bytes(size);
	for (i = 0; i < n; i++) {
		ABD_SCATTER(abd).abd_chunks[i] =
		    kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
	}
}

void
abd_free_chunks(abd_t *abd)
{
	uint_t i, n;

	n = abd_scatter_chunkcnt(abd);
	for (i = 0; i < n; i++) {
		kmem_cache_free(abd_chunk_cache,
		    ABD_SCATTER(abd).abd_chunks[i]);
	}
}

abd_t *
abd_alloc_struct_impl(size_t size)
{
	uint_t chunkcnt = abd_chunkcnt_for_bytes(size);
	/*
	 * In the event we are allocating a gang ABD, the size passed in
	 * will be 0. We must make sure to set abd_size to the size of an
	 * ABD struct as opposed to an ABD scatter with 0 chunks. The gang
	 * ABD struct allocation accounts for an additional 24 bytes over
	 * a scatter ABD with 0 chunks.
	 */
	size_t abd_size = MAX(sizeof (abd_t),
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
	abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, abd_size);

	return (abd);
}

void
abd_free_struct_impl(abd_t *abd)
{
	uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
	    abd_scatter_chunkcnt(abd);
	ssize_t size = MAX(sizeof (abd_t),
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
	kmem_free(abd, size);
	ABDSTAT_INCR(abdstat_struct_size, -size);
}

/*
 * Allocate a scatter ABD of size SPA_MAXBLOCKSIZE, where each chunk in
 * the scatterlist points at the same zero'd region.
 */
_Static_assert(ZERO_REGION_SIZE >= PAGE_SIZE, "zero_region too small");
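/*
 * With 4KB pages and the 16MB SPA_MAXBLOCKSIZE this fills in 4096 chunk
 * pointers, each aliasing the kernel's pre-zeroed zero_region (hence the
 * assertion above that zero_region covers at least a full page).
 */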
static void
abd_alloc_zero_scatter(void)
{
	uint_t i, n;

	n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;

	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;

	for (i = 0; i < n; i++) {
		ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
		    __DECONST(void *, zero_region);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGE_SIZE);
}

static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGE_SIZE);

	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
}

static int
abd_kstats_update(kstat_t *ksp, int rw)
{
	abd_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	as->abdstat_struct_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_struct_size);
	as->abdstat_scatter_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
	as->abdstat_scatter_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
	as->abdstat_scatter_chunk_waste.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
	as->abdstat_linear_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_cnt);
	as->abdstat_linear_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_data_size);
	return (0);
}

void
abd_init(void)
{
	abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0,
	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);

	wmsum_init(&abd_sums.abdstat_struct_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		abd_ksp->ks_data = &abd_stats;
		abd_ksp->ks_update = abd_kstats_update;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	wmsum_fini(&abd_sums.abdstat_struct_size);
	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
	wmsum_fini(&abd_sums.abdstat_linear_cnt);
	wmsum_fini(&abd_sums.abdstat_linear_data_size);

	kmem_cache_destroy(abd_chunk_cache);
	abd_chunk_cache = NULL;
}

void
abd_free_linear_page(abd_t *abd)
{
	/*
	 * FreeBSD does not have scatter linear pages, so reaching
	 * this function is an error.
	 */
	VERIFY(0);
}

/*
 * If we're going to use this ABD for I/O through the block layer, then the
 * consumer of the ABD data doesn't care whether it's scattered or not, and
 * we don't plan to store this ABD in memory for a long period of time, so
 * we should allocate the ABD type that requires the least data copying to
 * do the I/O.
 *
 * Currently this is linear ABDs, however if ldi_strategy() can ever issue
 * I/Os using a scatter/gather list we should switch to that and replace
 * this call with vanilla abd_alloc().
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc_linear(size, is_metadata));
}

abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
    size_t size)
{
	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
	size_t chunkcnt = abd_chunkcnt_for_bytes(
	    (new_offset & PAGE_MASK) + size);
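	/*
	 * For example, with 4KB pages, abd_offset = 512 and off = 10000
	 * give new_offset = 10512: the clone's data starts in chunk
	 * 10512 >> 12 = 2 of the source, at an in-page offset of
	 * 10512 & 4095 = 2320 bytes.
	 */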

	ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd));

	/*
	 * If an abd struct is provided, it is only guaranteed to be the
	 * minimum size, so if we need room for additional chunks we must
	 * allocate a new struct.
	 */
	if (abd != NULL &&
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) >
	    sizeof (abd_t)) {
		abd = NULL;
	}

	if (abd == NULL)
		abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
	 */

	ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK;

	/* Copy the scatterlist starting at the correct offset */
	(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
	    &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT],
	    chunkcnt * sizeof (void *));

	return (abd);
}

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_pos = 0;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

/*
 * Helper function to determine whether the abd_iter has been exhausted
 * (i.e. has reached the end of the ABD).
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the aiter has already been
 * exhausted, in which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
}

/*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	abd_t *abd = aiter->iter_abd;
	size_t offset = aiter->iter_pos;
	if (abd_is_linear(abd)) {
		aiter->iter_mapsize = abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(abd);
	} else {
		offset += ABD_SCATTER(abd).abd_offset;
		paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT];
		offset &= PAGE_MASK;
		aiter->iter_mapsize = MIN(PAGE_SIZE - offset,
		    abd->abd_size - aiter->iter_pos);
	}
	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the
 * aiter has already been exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	if (!abd_iter_at_end(aiter)) {
		ASSERT3P(aiter->iter_mapaddr, !=, NULL);
		ASSERT3U(aiter->iter_mapsize, >, 0);
	}

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}
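
/*
 * For reference, a consumer of these iterator routines (e.g. the generic
 * abd_iterate_func() in the common ABD code) typically drives them roughly
 * like the following sketch:
 *
 *	struct abd_iter aiter;
 *	abd_iter_init(&aiter, abd);
 *	while (!abd_iter_at_end(&aiter)) {
 *		abd_iter_map(&aiter);
 *		// operate on aiter.iter_mapaddr for aiter.iter_mapsize bytes
 *		size_t len = aiter.iter_mapsize;
 *		abd_iter_unmap(&aiter);
 *		abd_iter_advance(&aiter, len);
 *	}
 */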

void
abd_cache_reap_now(void)
{
	kmem_cache_reap_soon(abd_chunk_cache);
}