/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

/*
 * See abd.c for a general overview of the ARC buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into
 * the kernel's virtual memory space, while scattered ABD data chunks are
 * allocated as physical pages and then mapped in only while they are actually
 * being accessed through one of the abd_* library functions. Using scattered
 * ABDs provides several benefits:
 *
 *  (1) They avoid use of kmem_*, preventing performance problems where running
 *      kmem_reap on very large memory systems never finishes and causes
 *      constant TLB shootdowns.
 *
 *  (2) Fragmentation is less of an issue since when we are at the limit of
 *      allocatable space, we won't have to search around for a long free
 *      hole in the VA space for large ARC allocations. Each chunk is mapped in
 *      individually, so even if we are using HIGHMEM (see next point) we
 *      wouldn't need to worry about finding a contiguous address range.
 *
 *  (3) If we are not using HIGHMEM, then all physical memory is always
 *      mapped into the kernel's address space, so we also avoid the map /
 *      unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space.  See abd_alloc_chunks() for details.
 */
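
/*
 * Illustrative sketch (not part of this file's code): a typical consumer
 * allocates, fills and frees an ABD through the common abd.c interfaces,
 * and on Linux a large request like this one will normally come back as a
 * scatter ABD backed by individually allocated pages:
 *
 *	abd_t *abd = abd_alloc(SPA_MAXBLOCKSIZE, B_FALSE);
 *	abd_zero(abd, SPA_MAXBLOCKSIZE);
 *	...
 *	abd_free(abd);
 *
 * Whether a given size is satisfied with a linear or a scatter ABD is
 * decided by abd_size_alloc_linear() below.
 */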

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif

#ifdef _KERNEL
#if defined(MAX_ORDER)
#define	ABD_MAX_ORDER	(MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define	ABD_MAX_ORDER	(MAX_PAGE_ORDER)
#endif
#else
#define	ABD_MAX_ORDER	(1)
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",				KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order.  These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
	 */
	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks; a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
	/*
	 *  The total number of retries encountered when attempting to
	 *  allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
	/*
	 *  The total number of retries encountered when attempting to
	 *  allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
};

static struct {
	wmsum_t abdstat_struct_size;
	wmsum_t abdstat_linear_cnt;
	wmsum_t abdstat_linear_data_size;
	wmsum_t abdstat_scatter_cnt;
	wmsum_t abdstat_scatter_data_size;
	wmsum_t abdstat_scatter_chunk_waste;
	wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
	wmsum_t abdstat_scatter_page_multi_chunk;
	wmsum_t abdstat_scatter_page_multi_zone;
	wmsum_t abdstat_scatter_page_alloc_retry;
	wmsum_t abdstat_scatter_sg_table_retry;
} abd_sums;

#define	abd_for_each_sg(abd, sg, n, i)	\
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's.  Smaller allocations will use linear ABD's, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page).  Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations.  This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
static int zfs_abd_scatter_min_size = 512 * 3;
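
/*
 * Illustrative arithmetic for the default threshold above: a 1KB request
 * (below 1536) is linear, while a 2KB request is scatter and leaves the
 * trailing 2KB of its single 4KB page unused; that unused tail is what the
 * scatter_chunk_waste kstat accounts for.
 */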

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 * just a single zero'd page. This allows us to conserve memory by
 * only using a single zero page for the scatterlist.
 */
abd_t *abd_zero_scatter = NULL;

struct page;
/*
 * _KERNEL   - Will point to ZERO_PAGE if it is available or it will be
 *             an allocated zero'd PAGESIZE buffer.
 * Userspace - Will be an allocated zero'd PAGESIZE buffer.
 *
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
 */
static struct page *abd_zero_page = NULL;

static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}
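
/*
 * For example, with 4KB pages abd_chunkcnt_for_bytes(6000) rounds 6000 up
 * to 8192 and returns 2, while an exact multiple such as 8192 also returns
 * 2 with nothing to round.
 */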

abd_t *
abd_alloc_struct_impl(size_t size)
{
	/*
	 * In Linux we do not use the size passed in during ABD
	 * allocation, so we just ignore it.
	 */
	(void) size;
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

void
abd_free_struct_impl(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;

/*
 * Mark zfs data pages so they can be excluded from kernel crash dumps
 */
#ifdef _LP64
#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E

static inline void
abd_mark_zfs_page(struct page *page)
{
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ABD_FILE_CACHE_PAGE);
}

static inline void
abd_unmark_zfs_page(struct page *page)
{
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#else
#define	abd_mark_zfs_page(page)
#define	abd_unmark_zfs_page(page)
#endif /* _LP64 */

#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define	__GFP_RECLAIM		__GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone.  Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction.  When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
 */
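/*
 * Illustrative walk-through of the loop below: a 48K (12 page) request with
 * a sufficiently large max_order first picks order
 * MIN(highbit64(12) - 1, max_order) = 3 and allocates one 8 page compound
 * chunk, then order 2 for the remaining 4 pages.  If a high-order
 * allocation fails, max_order decays by one and the loop retries with
 * smaller chunks, bottoming out at single (order 0) pages.
 */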
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
	    ABD_MAX_ORDER - 1);
	unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
	unsigned int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	unsigned int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	ASSERT3U(alloc_pages, <, nr_pages);

	while (alloc_pages < nr_pages) {
		unsigned int chunk_pages;
		unsigned int order;

		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		abd_mark_zfs_page(page);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer.  All single-page (4K) ABD's can be
		 * represented this way.  Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages).  This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization can not be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}
#else

/*
 * Allocate N individual pages to construct a scatter ABD.  This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces.  It's designed for maximum compatibility.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
		abd_mark_zfs_page(page);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	struct sg_table table;

	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
	sg_free_table(&table);
}

void
abd_free_chunks(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		page = sg_page(sg);
		abd_unmark_zfs_page(page);
		order = compound_order(page);
		__free_pages(page, order);
		ASSERT3U(sg->length, <=, PAGE_SIZE << order);
		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
	}
	abd_free_sg_table(abd);
}

/*
 * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 * the scatterlist will be set to the zero'd out buffer abd_zero_page.
 */
static void
abd_alloc_zero_scatter(void)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	int i = 0;

#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
		schedule_timeout_interruptible(1);
	}
	abd_mark_zfs_page(abd_zero_page);
#else
	abd_zero_page = ZERO_PAGE(0);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}
	ASSERT3U(table.nents, ==, nr_pages);

	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define	PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif

#define	zfs_kmap_atomic(chunk)		((void *)chunk)
#define	zfs_kunmap_atomic(addr)		do { (void)(addr); } while (0)
#define	local_irq_save(flags)		do { (void)(flags); } while (0)
#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
#define	nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

struct scatterlist {
	struct page *page;
	int length;
	int end;
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	int nents = ABD_SCATTER(abd).abd_nents;
	vmem_free(ABD_SCATTER(abd).abd_sgl,
	    nents * sizeof (struct scatterlist));
}

#define	for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
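
/*
 * Illustrative sketch of the emulation above: iterating the chunks of a
 * userspace scatter ABD mirrors the kernel pattern.  Here each struct page
 * pointer is really the PAGESIZE buffer itself, so it can be used directly
 * (hypothetical zeroing loop, not used by this file):
 *
 *	struct scatterlist *sg = NULL;
 *	int i;
 *	abd_for_each_sg(abd, sg, ABD_SCATTER(abd).abd_nents, i)
 *		memset((void *)sg_page(sg), 0, sg->length);
 */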

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	if (sg->end)
		return (NULL);

	return (sg + 1);
}

void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

void
abd_free_chunks(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		struct page *p = nth_page(sg_page(sg), 0);
		umem_free_aligned(p, PAGESIZE);
	}
	abd_free_sg_table(abd);
}

static void
abd_alloc_zero_scatter(void)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	struct scatterlist *sg;
	int i;

	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
	memset(abd_zero_page, 0, PAGESIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);

	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#endif /* _KERNEL */

boolean_t
abd_size_alloc_linear(size_t size)
{
	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
}

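/*
 * Update the scatter kstats on allocate/free.  The "waste" is the unused
 * tail of the last page; for example (illustrative, 4KB pages), a 6000 byte
 * scatter ABD occupies two pages (8192 bytes) and wastes 8192 - 6000 =
 * 2192 bytes, which is charged to ARC_SPACE_ABD_CHUNK_WASTE.
 */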
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

void
abd_verify_scatter(abd_t *abd)
{
	size_t n;
	int i = 0;
	struct scatterlist *sg = NULL;

	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
	    ABD_SCATTER(abd).abd_sgl->length);
	n = ABD_SCATTER(abd).abd_nents;
	abd_for_each_sg(abd, sg, n, i) {
		ASSERT3P(sg_page(sg), !=, NULL);
	}
}

static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_free_sg_table(abd_zero_scatter);
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
	umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}

static int
abd_kstats_update(kstat_t *ksp, int rw)
{
	abd_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	as->abdstat_struct_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_struct_size);
	as->abdstat_linear_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_cnt);
	as->abdstat_linear_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_data_size);
	as->abdstat_scatter_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
	as->abdstat_scatter_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
	as->abdstat_scatter_chunk_waste.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++) {
		as->abdstat_scatter_orders[i].value.ui64 =
		    wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
	}
	as->abdstat_scatter_page_multi_chunk.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
	as->abdstat_scatter_page_multi_zone.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
	as->abdstat_scatter_page_alloc_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
	as->abdstat_scatter_sg_table_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
	return (0);
}

void
abd_init(void)
{
	int i;

	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	wmsum_init(&abd_sums.abdstat_struct_size, 0);
	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
	for (i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
	wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		for (i = 0; i < ABD_MAX_ORDER; i++) {
			snprintf(abd_stats.abdstat_scatter_orders[i].name,
			    KSTAT_STRLEN, "scatter_order_%d", i);
			abd_stats.abdstat_scatter_orders[i].data_type =
			    KSTAT_DATA_UINT64;
		}
		abd_ksp->ks_data = &abd_stats;
		abd_ksp->ks_update = abd_kstats_update;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	wmsum_fini(&abd_sums.abdstat_struct_size);
	wmsum_fini(&abd_sums.abdstat_linear_cnt);
	wmsum_fini(&abd_sums.abdstat_linear_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
	wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
	wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);

	if (abd_cache) {
		kmem_cache_destroy(abd_cache);
		abd_cache = NULL;
	}
}

void
abd_free_linear_page(abd_t *abd)
{
	/* Transform it back into a scatter ABD for freeing */
	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
	abd->abd_flags &= ~ABD_FLAG_LINEAR;
	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
	ABD_SCATTER(abd).abd_nents = 1;
	ABD_SCATTER(abd).abd_offset = 0;
	ABD_SCATTER(abd).abd_sgl = sg;
	abd_free_chunks(abd);

	abd_update_scatter_stats(abd, ABDSTAT_DECR);
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care whether it's scattered or not, and
 * we don't plan to store this ABD in memory for a long period of time, then
 * we should allocate the ABD type that requires the least data copying to
 * do the I/O.
 *
 * On Linux the optimal thing to do would be to use abd_get_offset() and
 * construct a new ABD which shares the original pages thereby eliminating
 * the copy.  But for the moment a new linear ABD is allocated until this
 * performance optimization can be implemented.
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc(size, is_metadata));
}

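/*
 * Build an ABD that shares the pages of sabd starting off bytes in.  As an
 * illustrative example: if sabd were backed by two 16K chunks and off were
 * 20K, the loop below would skip the first chunk (new_offset 20K - 16K =
 * 4K), and the returned ABD would reference the second scatterlist entry
 * with a 4K offset and one fewer nents than sabd.
 */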
858eda14cbcSMatt Macy abd_t *
abd_get_offset_scatter(abd_t * abd,abd_t * sabd,size_t off,size_t size)8597cd22ac4SMartin Matuska abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
8607cd22ac4SMartin Matuska     size_t size)
861eda14cbcSMatt Macy {
862e92ffd9bSMartin Matuska 	(void) size;
863eda14cbcSMatt Macy 	int i = 0;
864eda14cbcSMatt Macy 	struct scatterlist *sg = NULL;
865eda14cbcSMatt Macy 
866eda14cbcSMatt Macy 	abd_verify(sabd);
867eda14cbcSMatt Macy 	ASSERT3U(off, <=, sabd->abd_size);
868eda14cbcSMatt Macy 
869eda14cbcSMatt Macy 	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
870eda14cbcSMatt Macy 
871184c1b94SMartin Matuska 	if (abd == NULL)
872eda14cbcSMatt Macy 		abd = abd_alloc_struct(0);
873eda14cbcSMatt Macy 
874eda14cbcSMatt Macy 	/*
875eda14cbcSMatt Macy 	 * Even if this buf is filesystem metadata, we only track that
876eda14cbcSMatt Macy 	 * if we own the underlying data buffer, which is not true in
877eda14cbcSMatt Macy 	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
878eda14cbcSMatt Macy 	 */
879eda14cbcSMatt Macy 
880eda14cbcSMatt Macy 	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
881eda14cbcSMatt Macy 		if (new_offset < sg->length)
882eda14cbcSMatt Macy 			break;
883eda14cbcSMatt Macy 		new_offset -= sg->length;
884eda14cbcSMatt Macy 	}
885eda14cbcSMatt Macy 
886eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_sgl = sg;
887eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_offset = new_offset;
888eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
889eda14cbcSMatt Macy 
890eda14cbcSMatt Macy 	return (abd);
891eda14cbcSMatt Macy }

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
	if (!abd_is_linear(abd)) {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the aiter is already exhausted, in
 * which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	/*
	 * Ensure that the last chunk is not in use. abd_iterate_*() must
	 * clear this state (directly or via abd_iter_unmap()) before
	 * advancing.
	 */
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);
	ASSERT3P(aiter->iter_page, ==, NULL);
	ASSERT0(aiter->iter_page_doff);
	ASSERT0(aiter->iter_page_dsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	aiter->iter_offset += amount;
	if (!abd_is_linear(aiter->iter_abd)) {
		while (aiter->iter_offset >= aiter->iter_sg->length) {
			aiter->iter_offset -= aiter->iter_sg->length;
			aiter->iter_sg = sg_next(aiter->iter_sg);
			if (aiter->iter_sg == NULL) {
				ASSERT0(aiter->iter_offset);
				break;
			}
		}
	}
}

/*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * is already exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
		offset = aiter->iter_offset;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
	} else {
		offset = aiter->iter_offset;
		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);

		paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg));
	}

	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the aiter
 * is already exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (!abd_is_linear(aiter->iter_abd)) {
		/* LINTED E_FUNC_SET_NOT_USED */
		zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset);
	}

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

void
abd_cache_reap_now(void)
{
}

#if defined(_KERNEL)

1019eda14cbcSMatt Macy /*
10200d4ad640SMartin Matuska  * This is abd_iter_page(), the function underneath abd_iterate_page_func().
10210d4ad640SMartin Matuska  * It yields the next page struct and data offset and size within it, without
1022783d3ff6SMartin Matuska  * mapping it into the address space.
1023783d3ff6SMartin Matuska  */
10240d4ad640SMartin Matuska 
/*
 * "Compound pages" are a group of pages that can be referenced from a single
 * struct page *. Such a group is organised as a "head" page, followed by a
 * series of "tail" pages.
 *
 * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
 * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
 * great many of the IO buffers we get are going to be of this type.
 *
 * The tail pages are just regular PAGESIZE pages, and can be safely used
 * as-is. However, the head page has length covering itself and all the tail
 * pages. If the ABD chunk spans multiple pages, then we can use the head page
 * and a >PAGESIZE length, which is far more efficient.
 *
 * Before kernel 4.5 however, compound page heads were refcounted separately
 * from tail pages, such that moving back to the head page would require us to
 * take a reference to it and release it once we're completely finished with
 * it. In practice, that means when our caller is done with the ABD, which we
 * have no insight into from here. Rather than contort this API to track head
 * page references on such ancient kernels, we disable this special compound
 * page handling on them, instead treating each page within the compound page
 * as a regular PAGESIZE page (which it is). This is slightly less efficient,
 * but makes everything far simpler.
 *
 * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the
 * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to
 * understand compound pages, or not, as required.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
#define	ABD_ITER_COMPOUND_PAGES		1
#define	ABD_ITER_PAGE_SIZE(page)	\
	(PageCompound(page) ? page_size(page) : PAGESIZE)
#else
#undef ABD_ITER_COMPOUND_PAGES
#define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
#endif
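
/*
 * For example (illustrative): on a 4K-page system, the head page of an
 * order-2 compound allocation has page_size() == 16K, so a single
 * abd_iter_page() step can yield up to 16K at once; on pre-4.5 kernels the
 * same span is instead yielded as four separate 4K pages.
 */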

void
abd_iter_page(struct abd_iter *aiter)
{
	if (abd_iter_at_end(aiter)) {
		aiter->iter_page = NULL;
		aiter->iter_page_doff = 0;
		aiter->iter_page_dsize = 0;
		return;
	}

	struct page *page;
	size_t doff, dsize;

	/*
	 * Find the page, and the start of the data within it. This is computed
	 * differently for linear and scatter ABDs; linear is referenced by
	 * virtual memory location, while scatter is referenced by page
	 * pointer.
	 */
	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

		/* memory address at iter_pos */
		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;

		/* struct page for address */
		page = is_vmalloc_addr(paddr) ?
		    vmalloc_to_page(paddr) : virt_to_page(paddr);

		/* offset of address within the page */
		doff = offset_in_page(paddr);
	} else {
		ASSERT(!abd_is_gang(aiter->iter_abd));

		/* current scatter page */
		page = nth_page(sg_page(aiter->iter_sg),
		    aiter->iter_offset >> PAGE_SHIFT);

		/* position within page */
		doff = aiter->iter_offset & (PAGESIZE - 1);
	}

#ifdef ABD_ITER_COMPOUND_PAGES
	if (PageTail(page)) {
		/*
		 * If this is a compound tail page, move back to the head, and
		 * adjust the offset to match. This may let us yield a much
		 * larger amount of data from a single logical page, and so
		 * leave our caller with fewer pages to process.
		 */
		struct page *head = compound_head(page);
		doff += ((page - head) * PAGESIZE);
		page = head;
	}
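
	/*
	 * Worked example (illustrative): if `page` sits two pages past the
	 * head (page - head == 2) and doff is 0x100, the adjusted offset
	 * against the head page becomes 2 * PAGESIZE + 0x100, and the size
	 * computed below is measured from there.
	 */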
#endif

	ASSERT(page);

	/*
	 * Compute the maximum amount of data we can take from this page. This
	 * is the smaller of:
	 * - the remaining space in the page
	 * - the remaining space in this scatterlist entry (which may not cover
	 *   the entire page)
	 * - the remaining space in the abd (which may not cover the entire
	 *   scatterlist entry)
	 */
	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
	    aiter->iter_abd->abd_size - aiter->iter_pos);
	if (!abd_is_linear(aiter->iter_abd))
		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
	ASSERT3U(dsize, >, 0);

	/* final iterator outputs */
	aiter->iter_page = page;
	aiter->iter_page_doff = doff;
	aiter->iter_page_dsize = dsize;
}
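
/*
 * Usage sketch (illustrative only; see abd_iterate_page_func() for the real
 * driver). A caller consumes the iterator roughly like this:
 *
 *	abd_iter_init(&aiter, abd);
 *	abd_iter_advance(&aiter, off);
 *	while (size > 0) {
 *		abd_iter_page(&aiter);
 *		size_t len = MIN(aiter.iter_page_dsize, size);
 *		// use aiter.iter_page at aiter.iter_page_doff for len bytes
 *		abd_iter_advance(&aiter, len);
 *		size -= len;
 *	}
 */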

/*
 * Note: the ABD BIO functions below are only needed to support vdev_classic.
 * See the comments in vdev_disk.c.
 */

/*
 * bio_nr_pages for ABD.
 * @off is the offset in @abd
 */
unsigned long
abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
	unsigned long pos;

	if (abd_is_gang(abd)) {
		unsigned long count = 0;

		for (abd_t *cabd = abd_gang_get_offset(abd, &off);
		    cabd != NULL && size != 0;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT3U(off, <, cabd->abd_size);
			int mysize = MIN(size, cabd->abd_size - off);
			count += abd_nr_pages_off(cabd, mysize, off);
			size -= mysize;
			off = 0;
		}
		return (count);
	}

	if (abd_is_linear(abd))
		pos = (unsigned long)abd_to_buf(abd) + off;
	else
		pos = ABD_SCATTER(abd).abd_offset + off;

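	/*
	 * Pages spanned by [pos, pos + size). Worked example (illustrative,
	 * 4K pages): pos = 0x1ff0 and size = 0x40 ends at 0x2030, so
	 * ((0x2030 + 0xfff) >> 12) - (0x1ff0 >> 12) = 3 - 1 = 2 pages.
	 */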
	return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
	    (pos >> PAGE_SHIFT));
}

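/*
 * Fill @bio with pages backing the (virtually contiguous) buffer @buf_ptr,
 * up to @bio_size bytes or until the bio has no more room. Returns the
 * number of bytes that could not be added.
 */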
static unsigned int
bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(buf_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(buf_ptr))
			page = vmalloc_to_page(buf_ptr);
		else
			page = virt_to_page(buf_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well with 0-count pages; this is a safety
		 * net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		buf_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

/*
 * bio_map for gang ABD.
 */
static unsigned int
abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	ASSERT(abd_is_gang(abd));

	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
	    cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		ASSERT3U(off, <, cabd->abd_size);
		int size = MIN(io_size, cabd->abd_size - off);
		int remainder = abd_bio_map_off(bio, cabd, size, off);
		io_size -= (size - remainder);
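		/*
		 * A nonzero remainder means the bio filled up partway
		 * through this child, so there is no point trying further
		 * children; return the bytes we could not map.
		 */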
		if (io_size == 0 || remainder > 0)
			return (io_size);
		off = 0;
	}
	ASSERT0(io_size);
	return (io_size);
}

/*
 * bio_map for ABD.
 * @off is the offset in @abd
 * The remaining (unmapped) IO size is returned.
 */
unsigned int
abd_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	struct abd_iter aiter;

	ASSERT3U(io_size, <=, abd->abd_size - off);
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));

	ASSERT(!abd_is_linear(abd));
	if (abd_is_gang(abd))
		return (abd_gang_bio_map_off(bio, abd, io_size, off));

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	for (int i = 0; i < bio->bi_max_vecs; i++) {
		struct page *pg;
		size_t len, sgoff, pgoff;
		struct scatterlist *sg;

		if (io_size <= 0)
			break;

		sg = aiter.iter_sg;
		sgoff = aiter.iter_offset;
		pgoff = sgoff & (PAGESIZE - 1);
		len = MIN(io_size, PAGESIZE - pgoff);
		ASSERT(len > 0);

		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
		if (bio_add_page(bio, pg, len, pgoff) != len)
			break;

		io_size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (io_size);
}
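
/*
 * Usage sketch (illustrative; exact bio allocation is kernel-version
 * specific): a vdev_classic-style caller sizes the bio first, then maps:
 *
 *	unsigned long nr = abd_nr_pages_off(abd, io_size, off);
 *	struct bio *bio = ... allocate with room for nr vecs ...;
 *	unsigned int left = abd_bio_map_off(bio, abd, io_size, off);
 *	// left > 0: the bio filled up; map the rest into a chained bio
 */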

/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
	"Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
	"Minimum size of scatter allocations.");
/* CSTYLED */
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");

#endif /* _KERNEL */