/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

/*
 * See abd.c for a general overview of the arc buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into the
 * kernel's virtual memory space, while scattered ABD data chunks are allocated
 * as physical pages and then mapped in only while they are actually being
 * accessed through one of the abd_* library functions. Using scattered ABDs
 * provides several benefits:
 *
 *  (1) They avoid use of kmem_*, preventing performance problems where running
 *      kmem_reap on very large memory systems never finishes and causes
 *      constant TLB shootdowns.
 *
 *  (2) Fragmentation is less of an issue since when we are at the limit of
 *      allocatable space, we won't have to search around for a long free
 *      hole in the VA space for large ARC allocations. Each chunk is mapped in
 *      individually, so even if we are using HIGHMEM (see next point) we
 *      wouldn't need to worry about finding a contiguous address range.
 *
 *  (3) If we are not using HIGHMEM, then all physical memory is always
 *      mapped into the kernel's address space, so we also avoid the map /
 *      unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space. See abd_alloc_chunks() for details.
52eda14cbcSMatt Macy */ 53eda14cbcSMatt Macy 54eda14cbcSMatt Macy #include <sys/abd_impl.h> 55eda14cbcSMatt Macy #include <sys/param.h> 56eda14cbcSMatt Macy #include <sys/zio.h> 57eda14cbcSMatt Macy #include <sys/arc.h> 58eda14cbcSMatt Macy #include <sys/zfs_context.h> 59eda14cbcSMatt Macy #include <sys/zfs_znode.h> 60eda14cbcSMatt Macy #ifdef _KERNEL 61eda14cbcSMatt Macy #include <linux/kmap_compat.h> 62eda14cbcSMatt Macy #include <linux/scatterlist.h> 63eda14cbcSMatt Macy #else 64eda14cbcSMatt Macy #define MAX_ORDER 1 65eda14cbcSMatt Macy #endif 66eda14cbcSMatt Macy 67eda14cbcSMatt Macy typedef struct abd_stats { 68eda14cbcSMatt Macy kstat_named_t abdstat_struct_size; 69eda14cbcSMatt Macy kstat_named_t abdstat_linear_cnt; 70eda14cbcSMatt Macy kstat_named_t abdstat_linear_data_size; 71eda14cbcSMatt Macy kstat_named_t abdstat_scatter_cnt; 72eda14cbcSMatt Macy kstat_named_t abdstat_scatter_data_size; 73eda14cbcSMatt Macy kstat_named_t abdstat_scatter_chunk_waste; 74eda14cbcSMatt Macy kstat_named_t abdstat_scatter_orders[MAX_ORDER]; 75eda14cbcSMatt Macy kstat_named_t abdstat_scatter_page_multi_chunk; 76eda14cbcSMatt Macy kstat_named_t abdstat_scatter_page_multi_zone; 77eda14cbcSMatt Macy kstat_named_t abdstat_scatter_page_alloc_retry; 78eda14cbcSMatt Macy kstat_named_t abdstat_scatter_sg_table_retry; 79eda14cbcSMatt Macy } abd_stats_t; 80eda14cbcSMatt Macy 81eda14cbcSMatt Macy static abd_stats_t abd_stats = { 82eda14cbcSMatt Macy /* Amount of memory occupied by all of the abd_t struct allocations */ 83eda14cbcSMatt Macy { "struct_size", KSTAT_DATA_UINT64 }, 84eda14cbcSMatt Macy /* 85eda14cbcSMatt Macy * The number of linear ABDs which are currently allocated, excluding 86eda14cbcSMatt Macy * ABDs which don't own their data (for instance the ones which were 87eda14cbcSMatt Macy * allocated through abd_get_offset() and abd_get_from_buf()). If an 88eda14cbcSMatt Macy * ABD takes ownership of its buf then it will become tracked. 
89eda14cbcSMatt Macy */ 90eda14cbcSMatt Macy { "linear_cnt", KSTAT_DATA_UINT64 }, 91eda14cbcSMatt Macy /* Amount of data stored in all linear ABDs tracked by linear_cnt */ 92eda14cbcSMatt Macy { "linear_data_size", KSTAT_DATA_UINT64 }, 93eda14cbcSMatt Macy /* 94eda14cbcSMatt Macy * The number of scatter ABDs which are currently allocated, excluding 95eda14cbcSMatt Macy * ABDs which don't own their data (for instance the ones which were 96eda14cbcSMatt Macy * allocated through abd_get_offset()). 97eda14cbcSMatt Macy */ 98eda14cbcSMatt Macy { "scatter_cnt", KSTAT_DATA_UINT64 }, 99eda14cbcSMatt Macy /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ 100eda14cbcSMatt Macy { "scatter_data_size", KSTAT_DATA_UINT64 }, 101eda14cbcSMatt Macy /* 102eda14cbcSMatt Macy * The amount of space wasted at the end of the last chunk across all 103eda14cbcSMatt Macy * scatter ABDs tracked by scatter_cnt. 104eda14cbcSMatt Macy */ 105eda14cbcSMatt Macy { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, 106eda14cbcSMatt Macy /* 107eda14cbcSMatt Macy * The number of compound allocations of a given order. These 108eda14cbcSMatt Macy * allocations are spread over all currently allocated ABDs, and 109eda14cbcSMatt Macy * act as a measure of memory fragmentation. 110eda14cbcSMatt Macy */ 111eda14cbcSMatt Macy { { "scatter_order_N", KSTAT_DATA_UINT64 } }, 112eda14cbcSMatt Macy /* 113eda14cbcSMatt Macy * The number of scatter ABDs which contain multiple chunks. 114eda14cbcSMatt Macy * ABDs are preferentially allocated from the minimum number of 115eda14cbcSMatt Macy * contiguous multi-page chunks, a single chunk is optimal. 116eda14cbcSMatt Macy */ 117eda14cbcSMatt Macy { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, 118eda14cbcSMatt Macy /* 119eda14cbcSMatt Macy * The number of scatter ABDs which are split across memory zones. 120eda14cbcSMatt Macy * ABDs are preferentially allocated using pages from a single zone. 
121eda14cbcSMatt Macy */ 122eda14cbcSMatt Macy { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, 123eda14cbcSMatt Macy /* 124eda14cbcSMatt Macy * The total number of retries encountered when attempting to 125eda14cbcSMatt Macy * allocate the pages to populate the scatter ABD. 126eda14cbcSMatt Macy */ 127eda14cbcSMatt Macy { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, 128eda14cbcSMatt Macy /* 129eda14cbcSMatt Macy * The total number of retries encountered when attempting to 130eda14cbcSMatt Macy * allocate the sg table for an ABD. 131eda14cbcSMatt Macy */ 132eda14cbcSMatt Macy { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, 133eda14cbcSMatt Macy }; 134eda14cbcSMatt Macy 135dbd5678dSMartin Matuska static struct { 1360d8fe237SMartin Matuska wmsum_t abdstat_struct_size; 1370d8fe237SMartin Matuska wmsum_t abdstat_linear_cnt; 1380d8fe237SMartin Matuska wmsum_t abdstat_linear_data_size; 1390d8fe237SMartin Matuska wmsum_t abdstat_scatter_cnt; 1400d8fe237SMartin Matuska wmsum_t abdstat_scatter_data_size; 1410d8fe237SMartin Matuska wmsum_t abdstat_scatter_chunk_waste; 1420d8fe237SMartin Matuska wmsum_t abdstat_scatter_orders[MAX_ORDER]; 1430d8fe237SMartin Matuska wmsum_t abdstat_scatter_page_multi_chunk; 1440d8fe237SMartin Matuska wmsum_t abdstat_scatter_page_multi_zone; 1450d8fe237SMartin Matuska wmsum_t abdstat_scatter_page_alloc_retry; 1460d8fe237SMartin Matuska wmsum_t abdstat_scatter_sg_table_retry; 1470d8fe237SMartin Matuska } abd_sums; 1480d8fe237SMartin Matuska 149eda14cbcSMatt Macy #define abd_for_each_sg(abd, sg, n, i) \ 150eda14cbcSMatt Macy for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) 151eda14cbcSMatt Macy 152eda14cbcSMatt Macy /* 153eda14cbcSMatt Macy * zfs_abd_scatter_min_size is the minimum allocation size to use scatter 154eda14cbcSMatt Macy * ABD's. Smaller allocations will use linear ABD's which uses 155eda14cbcSMatt Macy * zio_[data_]buf_alloc(). 
156eda14cbcSMatt Macy * 157eda14cbcSMatt Macy * Scatter ABD's use at least one page each, so sub-page allocations waste 158eda14cbcSMatt Macy * some space when allocated as scatter (e.g. 2KB scatter allocation wastes 159eda14cbcSMatt Macy * half of each page). Using linear ABD's for small allocations means that 160eda14cbcSMatt Macy * they will be put on slabs which contain many allocations. This can 161eda14cbcSMatt Macy * improve memory efficiency, but it also makes it much harder for ARC 162eda14cbcSMatt Macy * evictions to actually free pages, because all the buffers on one slab need 163eda14cbcSMatt Macy * to be freed in order for the slab (and underlying pages) to be freed. 164eda14cbcSMatt Macy * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's 165eda14cbcSMatt Macy * possible for them to actually waste more memory than scatter (one page per 166eda14cbcSMatt Macy * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). 167eda14cbcSMatt Macy * 168eda14cbcSMatt Macy * Spill blocks are typically 512B and are heavily used on systems running 169eda14cbcSMatt Macy * selinux with the default dnode size and the `xattr=sa` property set. 170eda14cbcSMatt Macy * 171eda14cbcSMatt Macy * By default we use linear allocations for 512B and 1KB, and scatter 172eda14cbcSMatt Macy * allocations for larger (1.5KB and up). 173eda14cbcSMatt Macy */ 174e92ffd9bSMartin Matuska static int zfs_abd_scatter_min_size = 512 * 3; 175eda14cbcSMatt Macy 176eda14cbcSMatt Macy /* 177eda14cbcSMatt Macy * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are 178eda14cbcSMatt Macy * just a single zero'd page. This allows us to conserve memory by 179eda14cbcSMatt Macy * only using a single zero page for the scatterlist. 
180eda14cbcSMatt Macy */ 181eda14cbcSMatt Macy abd_t *abd_zero_scatter = NULL; 182eda14cbcSMatt Macy 183eda14cbcSMatt Macy struct page; 184eda14cbcSMatt Macy /* 185da5137abSMartin Matuska * _KERNEL - Will point to ZERO_PAGE if it is available or it will be 186da5137abSMartin Matuska * an allocated zero'd PAGESIZE buffer. 187da5137abSMartin Matuska * Userspace - Will be an allocated zero'ed PAGESIZE buffer. 188da5137abSMartin Matuska * 189da5137abSMartin Matuska * abd_zero_page is assigned to each of the pages of abd_zero_scatter. 190eda14cbcSMatt Macy */ 191eda14cbcSMatt Macy static struct page *abd_zero_page = NULL; 192eda14cbcSMatt Macy 193eda14cbcSMatt Macy static kmem_cache_t *abd_cache = NULL; 194eda14cbcSMatt Macy static kstat_t *abd_ksp; 195eda14cbcSMatt Macy 1967877fdebSMatt Macy static uint_t 197eda14cbcSMatt Macy abd_chunkcnt_for_bytes(size_t size) 198eda14cbcSMatt Macy { 199eda14cbcSMatt Macy return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); 200eda14cbcSMatt Macy } 201eda14cbcSMatt Macy 202eda14cbcSMatt Macy abd_t * 203184c1b94SMartin Matuska abd_alloc_struct_impl(size_t size) 204eda14cbcSMatt Macy { 205eda14cbcSMatt Macy /* 206eda14cbcSMatt Macy * In Linux we do not use the size passed in during ABD 207eda14cbcSMatt Macy * allocation, so we just ignore it. 
208eda14cbcSMatt Macy */ 209e92ffd9bSMartin Matuska (void) size; 210eda14cbcSMatt Macy abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); 211eda14cbcSMatt Macy ASSERT3P(abd, !=, NULL); 212eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); 213eda14cbcSMatt Macy 214eda14cbcSMatt Macy return (abd); 215eda14cbcSMatt Macy } 216eda14cbcSMatt Macy 217eda14cbcSMatt Macy void 218184c1b94SMartin Matuska abd_free_struct_impl(abd_t *abd) 219eda14cbcSMatt Macy { 220eda14cbcSMatt Macy kmem_cache_free(abd_cache, abd); 221eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); 222eda14cbcSMatt Macy } 223eda14cbcSMatt Macy 224eda14cbcSMatt Macy #ifdef _KERNEL 225e92ffd9bSMartin Matuska static unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; 226e92ffd9bSMartin Matuska 227eda14cbcSMatt Macy /* 228eda14cbcSMatt Macy * Mark zfs data pages so they can be excluded from kernel crash dumps 229eda14cbcSMatt Macy */ 230eda14cbcSMatt Macy #ifdef _LP64 231eda14cbcSMatt Macy #define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E 232eda14cbcSMatt Macy 233eda14cbcSMatt Macy static inline void 234eda14cbcSMatt Macy abd_mark_zfs_page(struct page *page) 235eda14cbcSMatt Macy { 236eda14cbcSMatt Macy get_page(page); 237eda14cbcSMatt Macy SetPagePrivate(page); 238eda14cbcSMatt Macy set_page_private(page, ABD_FILE_CACHE_PAGE); 239eda14cbcSMatt Macy } 240eda14cbcSMatt Macy 241eda14cbcSMatt Macy static inline void 242eda14cbcSMatt Macy abd_unmark_zfs_page(struct page *page) 243eda14cbcSMatt Macy { 244eda14cbcSMatt Macy set_page_private(page, 0UL); 245eda14cbcSMatt Macy ClearPagePrivate(page); 246eda14cbcSMatt Macy put_page(page); 247eda14cbcSMatt Macy } 248eda14cbcSMatt Macy #else 249eda14cbcSMatt Macy #define abd_mark_zfs_page(page) 250eda14cbcSMatt Macy #define abd_unmark_zfs_page(page) 251eda14cbcSMatt Macy #endif /* _LP64 */ 252eda14cbcSMatt Macy 253eda14cbcSMatt Macy #ifndef CONFIG_HIGHMEM 254eda14cbcSMatt Macy 255eda14cbcSMatt Macy #ifndef __GFP_RECLAIM 
256eda14cbcSMatt Macy #define __GFP_RECLAIM __GFP_WAIT 257eda14cbcSMatt Macy #endif 258eda14cbcSMatt Macy 259eda14cbcSMatt Macy /* 260eda14cbcSMatt Macy * The goal is to minimize fragmentation by preferentially populating ABDs 261eda14cbcSMatt Macy * with higher order compound pages from a single zone. Allocation size is 262eda14cbcSMatt Macy * progressively decreased until it can be satisfied without performing 263eda14cbcSMatt Macy * reclaim or compaction. When necessary this function will degenerate to 264eda14cbcSMatt Macy * allocating individual pages and allowing reclaim to satisfy allocations. 265eda14cbcSMatt Macy */ 266eda14cbcSMatt Macy void 267eda14cbcSMatt Macy abd_alloc_chunks(abd_t *abd, size_t size) 268eda14cbcSMatt Macy { 269eda14cbcSMatt Macy struct list_head pages; 270eda14cbcSMatt Macy struct sg_table table; 271eda14cbcSMatt Macy struct scatterlist *sg; 272eda14cbcSMatt Macy struct page *page, *tmp_page = NULL; 273eda14cbcSMatt Macy gfp_t gfp = __GFP_NOWARN | GFP_NOIO; 274eda14cbcSMatt Macy gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; 275c9539b89SMartin Matuska unsigned int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); 276c9539b89SMartin Matuska unsigned int nr_pages = abd_chunkcnt_for_bytes(size); 277c9539b89SMartin Matuska unsigned int chunks = 0, zones = 0; 278eda14cbcSMatt Macy size_t remaining_size; 279eda14cbcSMatt Macy int nid = NUMA_NO_NODE; 280c9539b89SMartin Matuska unsigned int alloc_pages = 0; 281eda14cbcSMatt Macy 282eda14cbcSMatt Macy INIT_LIST_HEAD(&pages); 283eda14cbcSMatt Macy 284c9539b89SMartin Matuska ASSERT3U(alloc_pages, <, nr_pages); 285c9539b89SMartin Matuska 286eda14cbcSMatt Macy while (alloc_pages < nr_pages) { 287c9539b89SMartin Matuska unsigned int chunk_pages; 288c9539b89SMartin Matuska unsigned int order; 289eda14cbcSMatt Macy 290eda14cbcSMatt Macy order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); 291eda14cbcSMatt Macy chunk_pages = (1U << order); 292eda14cbcSMatt 
Macy 293eda14cbcSMatt Macy page = alloc_pages_node(nid, order ? gfp_comp : gfp, order); 294eda14cbcSMatt Macy if (page == NULL) { 295eda14cbcSMatt Macy if (order == 0) { 296eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); 297eda14cbcSMatt Macy schedule_timeout_interruptible(1); 298eda14cbcSMatt Macy } else { 299eda14cbcSMatt Macy max_order = MAX(0, order - 1); 300eda14cbcSMatt Macy } 301eda14cbcSMatt Macy continue; 302eda14cbcSMatt Macy } 303eda14cbcSMatt Macy 304eda14cbcSMatt Macy list_add_tail(&page->lru, &pages); 305eda14cbcSMatt Macy 306eda14cbcSMatt Macy if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) 307eda14cbcSMatt Macy zones++; 308eda14cbcSMatt Macy 309eda14cbcSMatt Macy nid = page_to_nid(page); 310eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_orders[order]); 311eda14cbcSMatt Macy chunks++; 312eda14cbcSMatt Macy alloc_pages += chunk_pages; 313eda14cbcSMatt Macy } 314eda14cbcSMatt Macy 315eda14cbcSMatt Macy ASSERT3S(alloc_pages, ==, nr_pages); 316eda14cbcSMatt Macy 317eda14cbcSMatt Macy while (sg_alloc_table(&table, chunks, gfp)) { 318eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); 319eda14cbcSMatt Macy schedule_timeout_interruptible(1); 320eda14cbcSMatt Macy } 321eda14cbcSMatt Macy 322eda14cbcSMatt Macy sg = table.sgl; 323eda14cbcSMatt Macy remaining_size = size; 324eda14cbcSMatt Macy list_for_each_entry_safe(page, tmp_page, &pages, lru) { 325eda14cbcSMatt Macy size_t sg_size = MIN(PAGESIZE << compound_order(page), 326eda14cbcSMatt Macy remaining_size); 327eda14cbcSMatt Macy sg_set_page(sg, page, sg_size, 0); 328eda14cbcSMatt Macy abd_mark_zfs_page(page); 329eda14cbcSMatt Macy remaining_size -= sg_size; 330eda14cbcSMatt Macy 331eda14cbcSMatt Macy sg = sg_next(sg); 332eda14cbcSMatt Macy list_del(&page->lru); 333eda14cbcSMatt Macy } 334eda14cbcSMatt Macy 335eda14cbcSMatt Macy /* 336eda14cbcSMatt Macy * These conditions ensure that a possible transformation to a linear 337eda14cbcSMatt Macy * ABD would be valid. 
338eda14cbcSMatt Macy */ 339eda14cbcSMatt Macy ASSERT(!PageHighMem(sg_page(table.sgl))); 340eda14cbcSMatt Macy ASSERT0(ABD_SCATTER(abd).abd_offset); 341eda14cbcSMatt Macy 342eda14cbcSMatt Macy if (table.nents == 1) { 343eda14cbcSMatt Macy /* 344eda14cbcSMatt Macy * Since there is only one entry, this ABD can be represented 345eda14cbcSMatt Macy * as a linear buffer. All single-page (4K) ABD's can be 346eda14cbcSMatt Macy * represented this way. Some multi-page ABD's can also be 347eda14cbcSMatt Macy * represented this way, if we were able to allocate a single 348eda14cbcSMatt Macy * "chunk" (higher-order "page" which represents a power-of-2 349eda14cbcSMatt Macy * series of physically-contiguous pages). This is often the 350eda14cbcSMatt Macy * case for 2-page (8K) ABD's. 351eda14cbcSMatt Macy * 352eda14cbcSMatt Macy * Representing a single-entry scatter ABD as a linear ABD 353eda14cbcSMatt Macy * has the performance advantage of avoiding the copy (and 354eda14cbcSMatt Macy * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. 355eda14cbcSMatt Macy * A performance increase of around 5% has been observed for 356eda14cbcSMatt Macy * ARC-cached reads (of small blocks which can take advantage 357eda14cbcSMatt Macy * of this). 358eda14cbcSMatt Macy * 359eda14cbcSMatt Macy * Note that this optimization is only possible because the 360eda14cbcSMatt Macy * pages are always mapped into the kernel's address space. 361eda14cbcSMatt Macy * This is not the case for highmem pages, so the 362eda14cbcSMatt Macy * optimization can not be made there. 
363eda14cbcSMatt Macy */ 364eda14cbcSMatt Macy abd->abd_flags |= ABD_FLAG_LINEAR; 365eda14cbcSMatt Macy abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; 366eda14cbcSMatt Macy abd->abd_u.abd_linear.abd_sgl = table.sgl; 367eda14cbcSMatt Macy ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl)); 368eda14cbcSMatt Macy } else if (table.nents > 1) { 369eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); 370eda14cbcSMatt Macy abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; 371eda14cbcSMatt Macy 372eda14cbcSMatt Macy if (zones) { 373eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); 374eda14cbcSMatt Macy abd->abd_flags |= ABD_FLAG_MULTI_ZONE; 375eda14cbcSMatt Macy } 376eda14cbcSMatt Macy 377eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl = table.sgl; 378eda14cbcSMatt Macy ABD_SCATTER(abd).abd_nents = table.nents; 379eda14cbcSMatt Macy } 380eda14cbcSMatt Macy } 381eda14cbcSMatt Macy #else 382eda14cbcSMatt Macy 383eda14cbcSMatt Macy /* 384eda14cbcSMatt Macy * Allocate N individual pages to construct a scatter ABD. This function 385eda14cbcSMatt Macy * makes no attempt to request contiguous pages and requires the minimal 386eda14cbcSMatt Macy * number of kernel interfaces. It's designed for maximum compatibility. 
387eda14cbcSMatt Macy */ 388eda14cbcSMatt Macy void 389eda14cbcSMatt Macy abd_alloc_chunks(abd_t *abd, size_t size) 390eda14cbcSMatt Macy { 391eda14cbcSMatt Macy struct scatterlist *sg = NULL; 392eda14cbcSMatt Macy struct sg_table table; 393eda14cbcSMatt Macy struct page *page; 394eda14cbcSMatt Macy gfp_t gfp = __GFP_NOWARN | GFP_NOIO; 395eda14cbcSMatt Macy int nr_pages = abd_chunkcnt_for_bytes(size); 396eda14cbcSMatt Macy int i = 0; 397eda14cbcSMatt Macy 398eda14cbcSMatt Macy while (sg_alloc_table(&table, nr_pages, gfp)) { 399eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); 400eda14cbcSMatt Macy schedule_timeout_interruptible(1); 401eda14cbcSMatt Macy } 402eda14cbcSMatt Macy 403eda14cbcSMatt Macy ASSERT3U(table.nents, ==, nr_pages); 404eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl = table.sgl; 405eda14cbcSMatt Macy ABD_SCATTER(abd).abd_nents = nr_pages; 406eda14cbcSMatt Macy 407eda14cbcSMatt Macy abd_for_each_sg(abd, sg, nr_pages, i) { 408eda14cbcSMatt Macy while ((page = __page_cache_alloc(gfp)) == NULL) { 409eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); 410eda14cbcSMatt Macy schedule_timeout_interruptible(1); 411eda14cbcSMatt Macy } 412eda14cbcSMatt Macy 413eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_orders[0]); 414eda14cbcSMatt Macy sg_set_page(sg, page, PAGESIZE, 0); 415eda14cbcSMatt Macy abd_mark_zfs_page(page); 416eda14cbcSMatt Macy } 417eda14cbcSMatt Macy 418eda14cbcSMatt Macy if (nr_pages > 1) { 419eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); 420eda14cbcSMatt Macy abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; 421eda14cbcSMatt Macy } 422eda14cbcSMatt Macy } 423eda14cbcSMatt Macy #endif /* !CONFIG_HIGHMEM */ 424eda14cbcSMatt Macy 425eda14cbcSMatt Macy /* 426eda14cbcSMatt Macy * This must be called if any of the sg_table allocation functions 427eda14cbcSMatt Macy * are called. 
428eda14cbcSMatt Macy */ 429eda14cbcSMatt Macy static void 430eda14cbcSMatt Macy abd_free_sg_table(abd_t *abd) 431eda14cbcSMatt Macy { 432eda14cbcSMatt Macy struct sg_table table; 433eda14cbcSMatt Macy 434eda14cbcSMatt Macy table.sgl = ABD_SCATTER(abd).abd_sgl; 435eda14cbcSMatt Macy table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents; 436eda14cbcSMatt Macy sg_free_table(&table); 437eda14cbcSMatt Macy } 438eda14cbcSMatt Macy 439eda14cbcSMatt Macy void 440eda14cbcSMatt Macy abd_free_chunks(abd_t *abd) 441eda14cbcSMatt Macy { 442eda14cbcSMatt Macy struct scatterlist *sg = NULL; 443eda14cbcSMatt Macy struct page *page; 444eda14cbcSMatt Macy int nr_pages = ABD_SCATTER(abd).abd_nents; 445eda14cbcSMatt Macy int order, i = 0; 446eda14cbcSMatt Macy 447eda14cbcSMatt Macy if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) 448eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); 449eda14cbcSMatt Macy 450eda14cbcSMatt Macy if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) 451eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); 452eda14cbcSMatt Macy 453eda14cbcSMatt Macy abd_for_each_sg(abd, sg, nr_pages, i) { 454eda14cbcSMatt Macy page = sg_page(sg); 455eda14cbcSMatt Macy abd_unmark_zfs_page(page); 456eda14cbcSMatt Macy order = compound_order(page); 457eda14cbcSMatt Macy __free_pages(page, order); 458eda14cbcSMatt Macy ASSERT3U(sg->length, <=, PAGE_SIZE << order); 459eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); 460eda14cbcSMatt Macy } 461eda14cbcSMatt Macy abd_free_sg_table(abd); 462eda14cbcSMatt Macy } 463eda14cbcSMatt Macy 464eda14cbcSMatt Macy /* 465eda14cbcSMatt Macy * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in 466eda14cbcSMatt Macy * the scatterlist will be set to the zero'd out buffer abd_zero_page. 
467eda14cbcSMatt Macy */ 468eda14cbcSMatt Macy static void 469eda14cbcSMatt Macy abd_alloc_zero_scatter(void) 470eda14cbcSMatt Macy { 471eda14cbcSMatt Macy struct scatterlist *sg = NULL; 472eda14cbcSMatt Macy struct sg_table table; 473eda14cbcSMatt Macy gfp_t gfp = __GFP_NOWARN | GFP_NOIO; 474eda14cbcSMatt Macy int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); 475eda14cbcSMatt Macy int i = 0; 476eda14cbcSMatt Macy 477da5137abSMartin Matuska #if defined(HAVE_ZERO_PAGE_GPL_ONLY) 478da5137abSMartin Matuska gfp_t gfp_zero_page = gfp | __GFP_ZERO; 479eda14cbcSMatt Macy while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) { 480eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); 481eda14cbcSMatt Macy schedule_timeout_interruptible(1); 482eda14cbcSMatt Macy } 483eda14cbcSMatt Macy abd_mark_zfs_page(abd_zero_page); 484da5137abSMartin Matuska #else 485da5137abSMartin Matuska abd_zero_page = ZERO_PAGE(0); 486da5137abSMartin Matuska #endif /* HAVE_ZERO_PAGE_GPL_ONLY */ 487eda14cbcSMatt Macy 488eda14cbcSMatt Macy while (sg_alloc_table(&table, nr_pages, gfp)) { 489eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); 490eda14cbcSMatt Macy schedule_timeout_interruptible(1); 491eda14cbcSMatt Macy } 492eda14cbcSMatt Macy ASSERT3U(table.nents, ==, nr_pages); 493eda14cbcSMatt Macy 494eda14cbcSMatt Macy abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); 495184c1b94SMartin Matuska abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; 496eda14cbcSMatt Macy ABD_SCATTER(abd_zero_scatter).abd_offset = 0; 497eda14cbcSMatt Macy ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; 498eda14cbcSMatt Macy ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; 499eda14cbcSMatt Macy abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; 500eda14cbcSMatt Macy abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; 501eda14cbcSMatt Macy 502eda14cbcSMatt Macy abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { 503eda14cbcSMatt Macy sg_set_page(sg, 
abd_zero_page, PAGESIZE, 0); 504eda14cbcSMatt Macy } 505eda14cbcSMatt Macy 506eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_cnt); 507eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); 508eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); 509eda14cbcSMatt Macy } 510eda14cbcSMatt Macy 511eda14cbcSMatt Macy #else /* _KERNEL */ 512eda14cbcSMatt Macy 513eda14cbcSMatt Macy #ifndef PAGE_SHIFT 514eda14cbcSMatt Macy #define PAGE_SHIFT (highbit64(PAGESIZE)-1) 515eda14cbcSMatt Macy #endif 516eda14cbcSMatt Macy 517f9693befSMartin Matuska #define zfs_kmap_atomic(chunk) ((void *)chunk) 518f9693befSMartin Matuska #define zfs_kunmap_atomic(addr) do { (void)(addr); } while (0) 519eda14cbcSMatt Macy #define local_irq_save(flags) do { (void)(flags); } while (0) 520eda14cbcSMatt Macy #define local_irq_restore(flags) do { (void)(flags); } while (0) 521eda14cbcSMatt Macy #define nth_page(pg, i) \ 522eda14cbcSMatt Macy ((struct page *)((void *)(pg) + (i) * PAGESIZE)) 523eda14cbcSMatt Macy 524eda14cbcSMatt Macy struct scatterlist { 525eda14cbcSMatt Macy struct page *page; 526eda14cbcSMatt Macy int length; 527eda14cbcSMatt Macy int end; 528eda14cbcSMatt Macy }; 529eda14cbcSMatt Macy 530eda14cbcSMatt Macy static void 531eda14cbcSMatt Macy sg_init_table(struct scatterlist *sg, int nr) 532eda14cbcSMatt Macy { 533eda14cbcSMatt Macy memset(sg, 0, nr * sizeof (struct scatterlist)); 534eda14cbcSMatt Macy sg[nr - 1].end = 1; 535eda14cbcSMatt Macy } 536eda14cbcSMatt Macy 537eda14cbcSMatt Macy /* 538eda14cbcSMatt Macy * This must be called if any of the sg_table allocation functions 539eda14cbcSMatt Macy * are called. 
540eda14cbcSMatt Macy */ 541eda14cbcSMatt Macy static void 542eda14cbcSMatt Macy abd_free_sg_table(abd_t *abd) 543eda14cbcSMatt Macy { 544eda14cbcSMatt Macy int nents = ABD_SCATTER(abd).abd_nents; 545eda14cbcSMatt Macy vmem_free(ABD_SCATTER(abd).abd_sgl, 546eda14cbcSMatt Macy nents * sizeof (struct scatterlist)); 547eda14cbcSMatt Macy } 548eda14cbcSMatt Macy 549eda14cbcSMatt Macy #define for_each_sg(sgl, sg, nr, i) \ 550eda14cbcSMatt Macy for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) 551eda14cbcSMatt Macy 552eda14cbcSMatt Macy static inline void 553eda14cbcSMatt Macy sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, 554eda14cbcSMatt Macy unsigned int offset) 555eda14cbcSMatt Macy { 556eda14cbcSMatt Macy /* currently we don't use offset */ 557eda14cbcSMatt Macy ASSERT(offset == 0); 558eda14cbcSMatt Macy sg->page = page; 559eda14cbcSMatt Macy sg->length = len; 560eda14cbcSMatt Macy } 561eda14cbcSMatt Macy 562eda14cbcSMatt Macy static inline struct page * 563eda14cbcSMatt Macy sg_page(struct scatterlist *sg) 564eda14cbcSMatt Macy { 565eda14cbcSMatt Macy return (sg->page); 566eda14cbcSMatt Macy } 567eda14cbcSMatt Macy 568eda14cbcSMatt Macy static inline struct scatterlist * 569eda14cbcSMatt Macy sg_next(struct scatterlist *sg) 570eda14cbcSMatt Macy { 571eda14cbcSMatt Macy if (sg->end) 572eda14cbcSMatt Macy return (NULL); 573eda14cbcSMatt Macy 574eda14cbcSMatt Macy return (sg + 1); 575eda14cbcSMatt Macy } 576eda14cbcSMatt Macy 577eda14cbcSMatt Macy void 578eda14cbcSMatt Macy abd_alloc_chunks(abd_t *abd, size_t size) 579eda14cbcSMatt Macy { 580eda14cbcSMatt Macy unsigned nr_pages = abd_chunkcnt_for_bytes(size); 581eda14cbcSMatt Macy struct scatterlist *sg; 582eda14cbcSMatt Macy int i; 583eda14cbcSMatt Macy 584eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * 585eda14cbcSMatt Macy sizeof (struct scatterlist), KM_SLEEP); 586eda14cbcSMatt Macy sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); 587eda14cbcSMatt 
Macy 588eda14cbcSMatt Macy abd_for_each_sg(abd, sg, nr_pages, i) { 589eda14cbcSMatt Macy struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); 590eda14cbcSMatt Macy sg_set_page(sg, p, PAGESIZE, 0); 591eda14cbcSMatt Macy } 592eda14cbcSMatt Macy ABD_SCATTER(abd).abd_nents = nr_pages; 593eda14cbcSMatt Macy } 594eda14cbcSMatt Macy 595eda14cbcSMatt Macy void 596eda14cbcSMatt Macy abd_free_chunks(abd_t *abd) 597eda14cbcSMatt Macy { 598eda14cbcSMatt Macy int i, n = ABD_SCATTER(abd).abd_nents; 599eda14cbcSMatt Macy struct scatterlist *sg; 600eda14cbcSMatt Macy 601eda14cbcSMatt Macy abd_for_each_sg(abd, sg, n, i) { 602dbd5678dSMartin Matuska struct page *p = nth_page(sg_page(sg), 0); 603dbd5678dSMartin Matuska umem_free_aligned(p, PAGESIZE); 604eda14cbcSMatt Macy } 605eda14cbcSMatt Macy abd_free_sg_table(abd); 606eda14cbcSMatt Macy } 607eda14cbcSMatt Macy 608eda14cbcSMatt Macy static void 609eda14cbcSMatt Macy abd_alloc_zero_scatter(void) 610eda14cbcSMatt Macy { 611eda14cbcSMatt Macy unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); 612eda14cbcSMatt Macy struct scatterlist *sg; 613eda14cbcSMatt Macy int i; 614eda14cbcSMatt Macy 615eda14cbcSMatt Macy abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); 616eda14cbcSMatt Macy memset(abd_zero_page, 0, PAGESIZE); 617eda14cbcSMatt Macy abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); 618184c1b94SMartin Matuska abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; 619eda14cbcSMatt Macy abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; 620eda14cbcSMatt Macy ABD_SCATTER(abd_zero_scatter).abd_offset = 0; 621eda14cbcSMatt Macy ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; 622eda14cbcSMatt Macy abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; 623eda14cbcSMatt Macy ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * 624eda14cbcSMatt Macy sizeof (struct scatterlist), KM_SLEEP); 625eda14cbcSMatt Macy 626eda14cbcSMatt Macy sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, 
nr_pages); 627eda14cbcSMatt Macy 628eda14cbcSMatt Macy abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { 629eda14cbcSMatt Macy sg_set_page(sg, abd_zero_page, PAGESIZE, 0); 630eda14cbcSMatt Macy } 631eda14cbcSMatt Macy 632eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_cnt); 633eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); 634eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); 635eda14cbcSMatt Macy } 636eda14cbcSMatt Macy 637eda14cbcSMatt Macy #endif /* _KERNEL */ 638eda14cbcSMatt Macy 639eda14cbcSMatt Macy boolean_t 640eda14cbcSMatt Macy abd_size_alloc_linear(size_t size) 641eda14cbcSMatt Macy { 6421f88aa09SMartin Matuska return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size); 643eda14cbcSMatt Macy } 644eda14cbcSMatt Macy 645eda14cbcSMatt Macy void 646eda14cbcSMatt Macy abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) 647eda14cbcSMatt Macy { 648eda14cbcSMatt Macy ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); 649eda14cbcSMatt Macy int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size; 650eda14cbcSMatt Macy if (op == ABDSTAT_INCR) { 651eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_scatter_cnt); 652eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); 653eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste); 654eda14cbcSMatt Macy arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); 655eda14cbcSMatt Macy } else { 656eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); 657eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); 658eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste); 659eda14cbcSMatt Macy arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); 660eda14cbcSMatt Macy } 661eda14cbcSMatt Macy } 662eda14cbcSMatt Macy 663eda14cbcSMatt Macy void 664eda14cbcSMatt Macy abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) 665eda14cbcSMatt Macy { 666eda14cbcSMatt Macy ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); 
667eda14cbcSMatt Macy if (op == ABDSTAT_INCR) { 668eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_linear_cnt); 669eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); 670eda14cbcSMatt Macy } else { 671eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_linear_cnt); 672eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); 673eda14cbcSMatt Macy } 674eda14cbcSMatt Macy } 675eda14cbcSMatt Macy 676eda14cbcSMatt Macy void 677eda14cbcSMatt Macy abd_verify_scatter(abd_t *abd) 678eda14cbcSMatt Macy { 679eda14cbcSMatt Macy size_t n; 680eda14cbcSMatt Macy int i = 0; 681eda14cbcSMatt Macy struct scatterlist *sg = NULL; 682eda14cbcSMatt Macy 683eda14cbcSMatt Macy ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); 684eda14cbcSMatt Macy ASSERT3U(ABD_SCATTER(abd).abd_offset, <, 685eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl->length); 686eda14cbcSMatt Macy n = ABD_SCATTER(abd).abd_nents; 687eda14cbcSMatt Macy abd_for_each_sg(abd, sg, n, i) { 688eda14cbcSMatt Macy ASSERT3P(sg_page(sg), !=, NULL); 689eda14cbcSMatt Macy } 690eda14cbcSMatt Macy } 691eda14cbcSMatt Macy 692eda14cbcSMatt Macy static void 693eda14cbcSMatt Macy abd_free_zero_scatter(void) 694eda14cbcSMatt Macy { 695eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); 696eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE); 697eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); 698eda14cbcSMatt Macy 699eda14cbcSMatt Macy abd_free_sg_table(abd_zero_scatter); 700eda14cbcSMatt Macy abd_free_struct(abd_zero_scatter); 701eda14cbcSMatt Macy abd_zero_scatter = NULL; 702eda14cbcSMatt Macy ASSERT3P(abd_zero_page, !=, NULL); 703eda14cbcSMatt Macy #if defined(_KERNEL) 704da5137abSMartin Matuska #if defined(HAVE_ZERO_PAGE_GPL_ONLY) 705eda14cbcSMatt Macy abd_unmark_zfs_page(abd_zero_page); 706eda14cbcSMatt Macy __free_page(abd_zero_page); 707da5137abSMartin Matuska #endif /* HAVE_ZERO_PAGE_GPL_ONLY */ 708eda14cbcSMatt Macy #else 709dbd5678dSMartin Matuska 
umem_free_aligned(abd_zero_page, PAGESIZE); 710eda14cbcSMatt Macy #endif /* _KERNEL */ 711eda14cbcSMatt Macy } 712eda14cbcSMatt Macy 7130d8fe237SMartin Matuska static int 7140d8fe237SMartin Matuska abd_kstats_update(kstat_t *ksp, int rw) 7150d8fe237SMartin Matuska { 7160d8fe237SMartin Matuska abd_stats_t *as = ksp->ks_data; 7170d8fe237SMartin Matuska 7180d8fe237SMartin Matuska if (rw == KSTAT_WRITE) 7190d8fe237SMartin Matuska return (EACCES); 7200d8fe237SMartin Matuska as->abdstat_struct_size.value.ui64 = 7210d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_struct_size); 7220d8fe237SMartin Matuska as->abdstat_linear_cnt.value.ui64 = 7230d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_linear_cnt); 7240d8fe237SMartin Matuska as->abdstat_linear_data_size.value.ui64 = 7250d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_linear_data_size); 7260d8fe237SMartin Matuska as->abdstat_scatter_cnt.value.ui64 = 7270d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_scatter_cnt); 7280d8fe237SMartin Matuska as->abdstat_scatter_data_size.value.ui64 = 7290d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_scatter_data_size); 7300d8fe237SMartin Matuska as->abdstat_scatter_chunk_waste.value.ui64 = 7310d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_scatter_chunk_waste); 7320d8fe237SMartin Matuska for (int i = 0; i < MAX_ORDER; i++) { 7330d8fe237SMartin Matuska as->abdstat_scatter_orders[i].value.ui64 = 7340d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_scatter_orders[i]); 7350d8fe237SMartin Matuska } 7360d8fe237SMartin Matuska as->abdstat_scatter_page_multi_chunk.value.ui64 = 7370d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk); 7380d8fe237SMartin Matuska as->abdstat_scatter_page_multi_zone.value.ui64 = 7390d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone); 7400d8fe237SMartin Matuska as->abdstat_scatter_page_alloc_retry.value.ui64 = 7410d8fe237SMartin Matuska 
wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry); 7420d8fe237SMartin Matuska as->abdstat_scatter_sg_table_retry.value.ui64 = 7430d8fe237SMartin Matuska wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry); 7440d8fe237SMartin Matuska return (0); 7450d8fe237SMartin Matuska } 7460d8fe237SMartin Matuska 747eda14cbcSMatt Macy void 748eda14cbcSMatt Macy abd_init(void) 749eda14cbcSMatt Macy { 750eda14cbcSMatt Macy int i; 751eda14cbcSMatt Macy 752eda14cbcSMatt Macy abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), 753eda14cbcSMatt Macy 0, NULL, NULL, NULL, NULL, NULL, 0); 754eda14cbcSMatt Macy 7550d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_struct_size, 0); 7560d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_linear_cnt, 0); 7570d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_linear_data_size, 0); 7580d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); 7590d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_scatter_data_size, 0); 7600d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0); 7610d8fe237SMartin Matuska for (i = 0; i < MAX_ORDER; i++) 7620d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0); 7630d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0); 7640d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0); 7650d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0); 7660d8fe237SMartin Matuska wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0); 7670d8fe237SMartin Matuska 768eda14cbcSMatt Macy abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, 769eda14cbcSMatt Macy sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 770eda14cbcSMatt Macy if (abd_ksp != NULL) { 771eda14cbcSMatt Macy for (i = 0; i < MAX_ORDER; i++) { 772eda14cbcSMatt Macy snprintf(abd_stats.abdstat_scatter_orders[i].name, 773eda14cbcSMatt Macy KSTAT_STRLEN, "scatter_order_%d", i); 774eda14cbcSMatt 
Macy abd_stats.abdstat_scatter_orders[i].data_type = 775eda14cbcSMatt Macy KSTAT_DATA_UINT64; 776eda14cbcSMatt Macy } 777eda14cbcSMatt Macy abd_ksp->ks_data = &abd_stats; 7780d8fe237SMartin Matuska abd_ksp->ks_update = abd_kstats_update; 779eda14cbcSMatt Macy kstat_install(abd_ksp); 780eda14cbcSMatt Macy } 781eda14cbcSMatt Macy 782eda14cbcSMatt Macy abd_alloc_zero_scatter(); 783eda14cbcSMatt Macy } 784eda14cbcSMatt Macy 785eda14cbcSMatt Macy void 786eda14cbcSMatt Macy abd_fini(void) 787eda14cbcSMatt Macy { 788eda14cbcSMatt Macy abd_free_zero_scatter(); 789eda14cbcSMatt Macy 790eda14cbcSMatt Macy if (abd_ksp != NULL) { 791eda14cbcSMatt Macy kstat_delete(abd_ksp); 792eda14cbcSMatt Macy abd_ksp = NULL; 793eda14cbcSMatt Macy } 794eda14cbcSMatt Macy 7950d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_struct_size); 7960d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_linear_cnt); 7970d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_linear_data_size); 7980d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_scatter_cnt); 7990d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_scatter_data_size); 8000d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste); 8010d8fe237SMartin Matuska for (int i = 0; i < MAX_ORDER; i++) 8020d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_scatter_orders[i]); 8030d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk); 8040d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone); 8050d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry); 8060d8fe237SMartin Matuska wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry); 8070d8fe237SMartin Matuska 808eda14cbcSMatt Macy if (abd_cache) { 809eda14cbcSMatt Macy kmem_cache_destroy(abd_cache); 810eda14cbcSMatt Macy abd_cache = NULL; 811eda14cbcSMatt Macy } 812eda14cbcSMatt Macy } 813eda14cbcSMatt Macy 814eda14cbcSMatt Macy void 815eda14cbcSMatt Macy abd_free_linear_page(abd_t *abd) 816eda14cbcSMatt Macy { 
817eda14cbcSMatt Macy /* Transform it back into a scatter ABD for freeing */ 818eda14cbcSMatt Macy struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; 819eda14cbcSMatt Macy abd->abd_flags &= ~ABD_FLAG_LINEAR; 820eda14cbcSMatt Macy abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; 821eda14cbcSMatt Macy ABD_SCATTER(abd).abd_nents = 1; 822eda14cbcSMatt Macy ABD_SCATTER(abd).abd_offset = 0; 823eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl = sg; 824eda14cbcSMatt Macy abd_free_chunks(abd); 825eda14cbcSMatt Macy 826eda14cbcSMatt Macy abd_update_scatter_stats(abd, ABDSTAT_DECR); 827eda14cbcSMatt Macy } 828eda14cbcSMatt Macy 829eda14cbcSMatt Macy /* 830eda14cbcSMatt Macy * If we're going to use this ABD for doing I/O using the block layer, the 831eda14cbcSMatt Macy * consumer of the ABD data doesn't care if it's scattered or not, and we don't 832eda14cbcSMatt Macy * plan to store this ABD in memory for a long period of time, we should 833eda14cbcSMatt Macy * allocate the ABD type that requires the least data copying to do the I/O. 834eda14cbcSMatt Macy * 835eda14cbcSMatt Macy * On Linux the optimal thing to do would be to use abd_get_offset() and 836eda14cbcSMatt Macy * construct a new ABD which shares the original pages thereby eliminating 837eda14cbcSMatt Macy * the copy. But for the moment a new linear ABD is allocated until this 838eda14cbcSMatt Macy * performance optimization can be implemented. 
839eda14cbcSMatt Macy */ 840eda14cbcSMatt Macy abd_t * 841eda14cbcSMatt Macy abd_alloc_for_io(size_t size, boolean_t is_metadata) 842eda14cbcSMatt Macy { 843eda14cbcSMatt Macy return (abd_alloc(size, is_metadata)); 844eda14cbcSMatt Macy } 845eda14cbcSMatt Macy 846eda14cbcSMatt Macy abd_t * 8477cd22ac4SMartin Matuska abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, 8487cd22ac4SMartin Matuska size_t size) 849eda14cbcSMatt Macy { 850e92ffd9bSMartin Matuska (void) size; 851eda14cbcSMatt Macy int i = 0; 852eda14cbcSMatt Macy struct scatterlist *sg = NULL; 853eda14cbcSMatt Macy 854eda14cbcSMatt Macy abd_verify(sabd); 855eda14cbcSMatt Macy ASSERT3U(off, <=, sabd->abd_size); 856eda14cbcSMatt Macy 857eda14cbcSMatt Macy size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; 858eda14cbcSMatt Macy 859184c1b94SMartin Matuska if (abd == NULL) 860eda14cbcSMatt Macy abd = abd_alloc_struct(0); 861eda14cbcSMatt Macy 862eda14cbcSMatt Macy /* 863eda14cbcSMatt Macy * Even if this buf is filesystem metadata, we only track that 864eda14cbcSMatt Macy * if we own the underlying data buffer, which is not true in 865eda14cbcSMatt Macy * this case. Therefore, we don't ever use ABD_FLAG_META here. 866eda14cbcSMatt Macy */ 867eda14cbcSMatt Macy 868eda14cbcSMatt Macy abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { 869eda14cbcSMatt Macy if (new_offset < sg->length) 870eda14cbcSMatt Macy break; 871eda14cbcSMatt Macy new_offset -= sg->length; 872eda14cbcSMatt Macy } 873eda14cbcSMatt Macy 874eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl = sg; 875eda14cbcSMatt Macy ABD_SCATTER(abd).abd_offset = new_offset; 876eda14cbcSMatt Macy ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; 877eda14cbcSMatt Macy 878eda14cbcSMatt Macy return (abd); 879eda14cbcSMatt Macy } 880eda14cbcSMatt Macy 881eda14cbcSMatt Macy /* 882eda14cbcSMatt Macy * Initialize the abd_iter. 
883eda14cbcSMatt Macy */ 884eda14cbcSMatt Macy void 885eda14cbcSMatt Macy abd_iter_init(struct abd_iter *aiter, abd_t *abd) 886eda14cbcSMatt Macy { 887eda14cbcSMatt Macy ASSERT(!abd_is_gang(abd)); 888eda14cbcSMatt Macy abd_verify(abd); 889eda14cbcSMatt Macy aiter->iter_abd = abd; 890eda14cbcSMatt Macy aiter->iter_mapaddr = NULL; 891eda14cbcSMatt Macy aiter->iter_mapsize = 0; 892eda14cbcSMatt Macy aiter->iter_pos = 0; 893eda14cbcSMatt Macy if (abd_is_linear(abd)) { 894eda14cbcSMatt Macy aiter->iter_offset = 0; 895eda14cbcSMatt Macy aiter->iter_sg = NULL; 896eda14cbcSMatt Macy } else { 897eda14cbcSMatt Macy aiter->iter_offset = ABD_SCATTER(abd).abd_offset; 898eda14cbcSMatt Macy aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; 899eda14cbcSMatt Macy } 900eda14cbcSMatt Macy } 901eda14cbcSMatt Macy 902eda14cbcSMatt Macy /* 903eda14cbcSMatt Macy * This is just a helper function to see if we have exhausted the 904eda14cbcSMatt Macy * abd_iter and reached the end. 905eda14cbcSMatt Macy */ 906eda14cbcSMatt Macy boolean_t 907eda14cbcSMatt Macy abd_iter_at_end(struct abd_iter *aiter) 908eda14cbcSMatt Macy { 909eda14cbcSMatt Macy return (aiter->iter_pos == aiter->iter_abd->abd_size); 910eda14cbcSMatt Macy } 911eda14cbcSMatt Macy 912eda14cbcSMatt Macy /* 913eda14cbcSMatt Macy * Advance the iterator by a certain amount. Cannot be called when a chunk is 914eda14cbcSMatt Macy * in use. This can be safely called when the aiter has already exhausted, in 915eda14cbcSMatt Macy * which case this does nothing. 
916eda14cbcSMatt Macy */ 917eda14cbcSMatt Macy void 918eda14cbcSMatt Macy abd_iter_advance(struct abd_iter *aiter, size_t amount) 919eda14cbcSMatt Macy { 920eda14cbcSMatt Macy ASSERT3P(aiter->iter_mapaddr, ==, NULL); 921eda14cbcSMatt Macy ASSERT0(aiter->iter_mapsize); 922eda14cbcSMatt Macy 923eda14cbcSMatt Macy /* There's nothing left to advance to, so do nothing */ 924eda14cbcSMatt Macy if (abd_iter_at_end(aiter)) 925eda14cbcSMatt Macy return; 926eda14cbcSMatt Macy 927eda14cbcSMatt Macy aiter->iter_pos += amount; 928eda14cbcSMatt Macy aiter->iter_offset += amount; 929eda14cbcSMatt Macy if (!abd_is_linear(aiter->iter_abd)) { 930eda14cbcSMatt Macy while (aiter->iter_offset >= aiter->iter_sg->length) { 931eda14cbcSMatt Macy aiter->iter_offset -= aiter->iter_sg->length; 932eda14cbcSMatt Macy aiter->iter_sg = sg_next(aiter->iter_sg); 933eda14cbcSMatt Macy if (aiter->iter_sg == NULL) { 934eda14cbcSMatt Macy ASSERT0(aiter->iter_offset); 935eda14cbcSMatt Macy break; 936eda14cbcSMatt Macy } 937eda14cbcSMatt Macy } 938eda14cbcSMatt Macy } 939eda14cbcSMatt Macy } 940eda14cbcSMatt Macy 941eda14cbcSMatt Macy /* 942eda14cbcSMatt Macy * Map the current chunk into aiter. This can be safely called when the aiter 943eda14cbcSMatt Macy * has already exhausted, in which case this does nothing. 
944eda14cbcSMatt Macy */ 945eda14cbcSMatt Macy void 946eda14cbcSMatt Macy abd_iter_map(struct abd_iter *aiter) 947eda14cbcSMatt Macy { 948eda14cbcSMatt Macy void *paddr; 949eda14cbcSMatt Macy size_t offset = 0; 950eda14cbcSMatt Macy 951eda14cbcSMatt Macy ASSERT3P(aiter->iter_mapaddr, ==, NULL); 952eda14cbcSMatt Macy ASSERT0(aiter->iter_mapsize); 953eda14cbcSMatt Macy 954eda14cbcSMatt Macy /* There's nothing left to iterate over, so do nothing */ 955eda14cbcSMatt Macy if (abd_iter_at_end(aiter)) 956eda14cbcSMatt Macy return; 957eda14cbcSMatt Macy 958eda14cbcSMatt Macy if (abd_is_linear(aiter->iter_abd)) { 959eda14cbcSMatt Macy ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); 960eda14cbcSMatt Macy offset = aiter->iter_offset; 961eda14cbcSMatt Macy aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; 962eda14cbcSMatt Macy paddr = ABD_LINEAR_BUF(aiter->iter_abd); 963eda14cbcSMatt Macy } else { 964eda14cbcSMatt Macy offset = aiter->iter_offset; 965eda14cbcSMatt Macy aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, 966eda14cbcSMatt Macy aiter->iter_abd->abd_size - aiter->iter_pos); 967eda14cbcSMatt Macy 968f9693befSMartin Matuska paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg)); 969eda14cbcSMatt Macy } 970eda14cbcSMatt Macy 971eda14cbcSMatt Macy aiter->iter_mapaddr = (char *)paddr + offset; 972eda14cbcSMatt Macy } 973eda14cbcSMatt Macy 974eda14cbcSMatt Macy /* 975eda14cbcSMatt Macy * Unmap the current chunk from aiter. This can be safely called when the aiter 976eda14cbcSMatt Macy * has already exhausted, in which case this does nothing. 
977eda14cbcSMatt Macy */ 978eda14cbcSMatt Macy void 979eda14cbcSMatt Macy abd_iter_unmap(struct abd_iter *aiter) 980eda14cbcSMatt Macy { 981eda14cbcSMatt Macy /* There's nothing left to unmap, so do nothing */ 982eda14cbcSMatt Macy if (abd_iter_at_end(aiter)) 983eda14cbcSMatt Macy return; 984eda14cbcSMatt Macy 985eda14cbcSMatt Macy if (!abd_is_linear(aiter->iter_abd)) { 986eda14cbcSMatt Macy /* LINTED E_FUNC_SET_NOT_USED */ 987f9693befSMartin Matuska zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset); 988eda14cbcSMatt Macy } 989eda14cbcSMatt Macy 990eda14cbcSMatt Macy ASSERT3P(aiter->iter_mapaddr, !=, NULL); 991eda14cbcSMatt Macy ASSERT3U(aiter->iter_mapsize, >, 0); 992eda14cbcSMatt Macy 993eda14cbcSMatt Macy aiter->iter_mapaddr = NULL; 994eda14cbcSMatt Macy aiter->iter_mapsize = 0; 995eda14cbcSMatt Macy } 996eda14cbcSMatt Macy 997eda14cbcSMatt Macy void 998eda14cbcSMatt Macy abd_cache_reap_now(void) 999eda14cbcSMatt Macy { 1000eda14cbcSMatt Macy } 1001eda14cbcSMatt Macy 1002eda14cbcSMatt Macy #if defined(_KERNEL) 1003eda14cbcSMatt Macy /* 1004eda14cbcSMatt Macy * bio_nr_pages for ABD. 
1005eda14cbcSMatt Macy * @off is the offset in @abd 1006eda14cbcSMatt Macy */ 1007eda14cbcSMatt Macy unsigned long 1008eda14cbcSMatt Macy abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) 1009eda14cbcSMatt Macy { 1010eda14cbcSMatt Macy unsigned long pos; 1011eda14cbcSMatt Macy 1012184c1b94SMartin Matuska if (abd_is_gang(abd)) { 1013184c1b94SMartin Matuska unsigned long count = 0; 1014eda14cbcSMatt Macy 1015184c1b94SMartin Matuska for (abd_t *cabd = abd_gang_get_offset(abd, &off); 1016184c1b94SMartin Matuska cabd != NULL && size != 0; 1017184c1b94SMartin Matuska cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { 1018184c1b94SMartin Matuska ASSERT3U(off, <, cabd->abd_size); 1019184c1b94SMartin Matuska int mysize = MIN(size, cabd->abd_size - off); 1020184c1b94SMartin Matuska count += abd_nr_pages_off(cabd, mysize, off); 1021184c1b94SMartin Matuska size -= mysize; 1022184c1b94SMartin Matuska off = 0; 1023184c1b94SMartin Matuska } 1024184c1b94SMartin Matuska return (count); 1025184c1b94SMartin Matuska } 1026184c1b94SMartin Matuska 1027eda14cbcSMatt Macy if (abd_is_linear(abd)) 1028eda14cbcSMatt Macy pos = (unsigned long)abd_to_buf(abd) + off; 1029eda14cbcSMatt Macy else 1030eda14cbcSMatt Macy pos = ABD_SCATTER(abd).abd_offset + off; 1031eda14cbcSMatt Macy 1032184c1b94SMartin Matuska return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - 1033184c1b94SMartin Matuska (pos >> PAGE_SHIFT)); 1034eda14cbcSMatt Macy } 1035eda14cbcSMatt Macy 1036eda14cbcSMatt Macy static unsigned int 1037eda14cbcSMatt Macy bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size) 1038eda14cbcSMatt Macy { 1039eda14cbcSMatt Macy unsigned int offset, size, i; 1040eda14cbcSMatt Macy struct page *page; 1041eda14cbcSMatt Macy 1042eda14cbcSMatt Macy offset = offset_in_page(buf_ptr); 1043eda14cbcSMatt Macy for (i = 0; i < bio->bi_max_vecs; i++) { 1044eda14cbcSMatt Macy size = PAGE_SIZE - offset; 1045eda14cbcSMatt Macy 1046eda14cbcSMatt Macy if (bio_size <= 0) 1047eda14cbcSMatt Macy 
break; 1048eda14cbcSMatt Macy 1049eda14cbcSMatt Macy if (size > bio_size) 1050eda14cbcSMatt Macy size = bio_size; 1051eda14cbcSMatt Macy 1052eda14cbcSMatt Macy if (is_vmalloc_addr(buf_ptr)) 1053eda14cbcSMatt Macy page = vmalloc_to_page(buf_ptr); 1054eda14cbcSMatt Macy else 1055eda14cbcSMatt Macy page = virt_to_page(buf_ptr); 1056eda14cbcSMatt Macy 1057eda14cbcSMatt Macy /* 1058eda14cbcSMatt Macy * Some network related block device uses tcp_sendpage, which 1059eda14cbcSMatt Macy * doesn't behave well when using 0-count page, this is a 1060eda14cbcSMatt Macy * safety net to catch them. 1061eda14cbcSMatt Macy */ 1062eda14cbcSMatt Macy ASSERT3S(page_count(page), >, 0); 1063eda14cbcSMatt Macy 1064eda14cbcSMatt Macy if (bio_add_page(bio, page, size, offset) != size) 1065eda14cbcSMatt Macy break; 1066eda14cbcSMatt Macy 1067eda14cbcSMatt Macy buf_ptr += size; 1068eda14cbcSMatt Macy bio_size -= size; 1069eda14cbcSMatt Macy offset = 0; 1070eda14cbcSMatt Macy } 1071eda14cbcSMatt Macy 1072eda14cbcSMatt Macy return (bio_size); 1073eda14cbcSMatt Macy } 1074eda14cbcSMatt Macy 1075eda14cbcSMatt Macy /* 1076eda14cbcSMatt Macy * bio_map for gang ABD. 
1077eda14cbcSMatt Macy */ 1078eda14cbcSMatt Macy static unsigned int 1079eda14cbcSMatt Macy abd_gang_bio_map_off(struct bio *bio, abd_t *abd, 1080eda14cbcSMatt Macy unsigned int io_size, size_t off) 1081eda14cbcSMatt Macy { 1082eda14cbcSMatt Macy ASSERT(abd_is_gang(abd)); 1083eda14cbcSMatt Macy 1084eda14cbcSMatt Macy for (abd_t *cabd = abd_gang_get_offset(abd, &off); 1085eda14cbcSMatt Macy cabd != NULL; 1086eda14cbcSMatt Macy cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { 1087eda14cbcSMatt Macy ASSERT3U(off, <, cabd->abd_size); 1088eda14cbcSMatt Macy int size = MIN(io_size, cabd->abd_size - off); 1089eda14cbcSMatt Macy int remainder = abd_bio_map_off(bio, cabd, size, off); 1090eda14cbcSMatt Macy io_size -= (size - remainder); 1091eda14cbcSMatt Macy if (io_size == 0 || remainder > 0) 1092eda14cbcSMatt Macy return (io_size); 1093eda14cbcSMatt Macy off = 0; 1094eda14cbcSMatt Macy } 1095eda14cbcSMatt Macy ASSERT0(io_size); 1096eda14cbcSMatt Macy return (io_size); 1097eda14cbcSMatt Macy } 1098eda14cbcSMatt Macy 1099eda14cbcSMatt Macy /* 1100eda14cbcSMatt Macy * bio_map for ABD. 
1101eda14cbcSMatt Macy * @off is the offset in @abd 1102eda14cbcSMatt Macy * Remaining IO size is returned 1103eda14cbcSMatt Macy */ 1104eda14cbcSMatt Macy unsigned int 1105eda14cbcSMatt Macy abd_bio_map_off(struct bio *bio, abd_t *abd, 1106eda14cbcSMatt Macy unsigned int io_size, size_t off) 1107eda14cbcSMatt Macy { 1108eda14cbcSMatt Macy struct abd_iter aiter; 1109eda14cbcSMatt Macy 1110eda14cbcSMatt Macy ASSERT3U(io_size, <=, abd->abd_size - off); 1111eda14cbcSMatt Macy if (abd_is_linear(abd)) 1112eda14cbcSMatt Macy return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size)); 1113eda14cbcSMatt Macy 1114eda14cbcSMatt Macy ASSERT(!abd_is_linear(abd)); 1115eda14cbcSMatt Macy if (abd_is_gang(abd)) 1116eda14cbcSMatt Macy return (abd_gang_bio_map_off(bio, abd, io_size, off)); 1117eda14cbcSMatt Macy 1118eda14cbcSMatt Macy abd_iter_init(&aiter, abd); 1119eda14cbcSMatt Macy abd_iter_advance(&aiter, off); 1120eda14cbcSMatt Macy 1121184c1b94SMartin Matuska for (int i = 0; i < bio->bi_max_vecs; i++) { 1122eda14cbcSMatt Macy struct page *pg; 1123eda14cbcSMatt Macy size_t len, sgoff, pgoff; 1124eda14cbcSMatt Macy struct scatterlist *sg; 1125eda14cbcSMatt Macy 1126eda14cbcSMatt Macy if (io_size <= 0) 1127eda14cbcSMatt Macy break; 1128eda14cbcSMatt Macy 1129eda14cbcSMatt Macy sg = aiter.iter_sg; 1130eda14cbcSMatt Macy sgoff = aiter.iter_offset; 1131eda14cbcSMatt Macy pgoff = sgoff & (PAGESIZE - 1); 1132eda14cbcSMatt Macy len = MIN(io_size, PAGESIZE - pgoff); 1133eda14cbcSMatt Macy ASSERT(len > 0); 1134eda14cbcSMatt Macy 1135eda14cbcSMatt Macy pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); 1136eda14cbcSMatt Macy if (bio_add_page(bio, pg, len, pgoff) != len) 1137eda14cbcSMatt Macy break; 1138eda14cbcSMatt Macy 1139eda14cbcSMatt Macy io_size -= len; 1140eda14cbcSMatt Macy abd_iter_advance(&aiter, len); 1141eda14cbcSMatt Macy } 1142eda14cbcSMatt Macy 1143eda14cbcSMatt Macy return (io_size); 1144eda14cbcSMatt Macy } 1145eda14cbcSMatt Macy 1146eda14cbcSMatt Macy /* Tunable 
Parameters */ 1147eda14cbcSMatt Macy module_param(zfs_abd_scatter_enabled, int, 0644); 1148eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_enabled, 1149eda14cbcSMatt Macy "Toggle whether ABD allocations must be linear."); 1150eda14cbcSMatt Macy module_param(zfs_abd_scatter_min_size, int, 0644); 1151eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_min_size, 1152eda14cbcSMatt Macy "Minimum size of scatter allocations."); 1153eda14cbcSMatt Macy /* CSTYLED */ 1154eda14cbcSMatt Macy module_param(zfs_abd_scatter_max_order, uint, 0644); 1155eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_max_order, 1156eda14cbcSMatt Macy "Maximum order allocation used for a scatter ABD."); 1157eda14cbcSMatt Macy #endif 1158