/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

/*
 * See abd.c for a general overview of the arc buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into the
 * kernel's virtual memory space, while scattered ABD data chunks are allocated
 * as physical pages and then mapped in only while they are actually being
 * accessed through one of the abd_* library functions. Using scattered ABDs
 * provides several benefits:
 *
 * (1) They avoid use of kmem_*, preventing performance problems where running
 *     kmem_reap on very large memory systems never finishes and causes
 *     constant TLB shootdowns.
 *
 * (2) Fragmentation is less of an issue since when we are at the limit of
 *     allocatable space, we won't have to search around for a long free
 *     hole in the VA space for large ARC allocations. Each chunk is mapped in
 *     individually, so even if we are using HIGHMEM (see next point) we
 *     wouldn't need to worry about finding a contiguous address range.
 *
 * (3) If we are not using HIGHMEM, then all physical memory is always
 *     mapped into the kernel's address space, so we also avoid the map /
 *     unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space. See abd_alloc_chunks() for details.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/scatterlist.h>
#else
/* In userland there are no compound pages; a single "order" suffices. */
#define	MAX_ORDER	1
#endif

/*
 * Per-module kstat counters exported under the abd kstat node. Each field
 * maps 1:1 to an entry in the abd_stats initializer below.
 */
typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",				KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order.  These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
	 */
	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks, a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
};

/* Iterate over the scatterlist entries of a scatter ABD. */
#define	abd_for_each_sg(abd, sg, n, i)	\
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's.  Smaller allocations will use linear ABD's which uses
 * zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page).  Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations.  This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
int zfs_abd_scatter_min_size = 512 * 3;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 * just a single zero'd page. This allows us to conserve memory by
 * only using a single zero page for the scatterlist.
 */
abd_t *abd_zero_scatter = NULL;

struct page;
/*
 * abd_zero_page is an allocated zero'd PAGESIZE buffer, which is
 * assigned to set each of the pages of abd_zero_scatter.
 */
static struct page *abd_zero_page = NULL;

/* kmem cache backing abd_alloc_struct_impl() / abd_free_struct_impl() */
static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

/* Number of PAGESIZE chunks needed to hold 'size' bytes (rounded up). */
static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}

/*
 * Allocate an abd_t structure from the kmem cache and account for it in
 * the struct_size kstat.  The returned struct is uninitialized.
 */
abd_t *
abd_alloc_struct_impl(size_t size)
{
	/*
	 * In Linux we do not use the size passed in during ABD
	 * allocation, so we just ignore it.
	 */
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

/* Return an abd_t structure to the kmem cache and update accounting. */
void
abd_free_struct_impl(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

#ifdef _KERNEL
/*
 * Mark zfs data pages so they can be excluded from kernel crash dumps
 */
#ifdef _LP64
#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E

static inline void
abd_mark_zfs_page(struct page *page)
{
	/*
	 * Take an extra reference and tag the page's private field with
	 * the ABD magic so crash-dump tools can identify (and skip) it.
	 */
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ABD_FILE_CACHE_PAGE);
}

static inline void
abd_unmark_zfs_page(struct page *page)
{
	/* Undo abd_mark_zfs_page(): clear the tag and drop the extra ref. */
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#else
#define	abd_mark_zfs_page(page)
#define	abd_unmark_zfs_page(page)
#endif /* _LP64 */

#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define	__GFP_RECLAIM		__GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone.  Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction.  When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	/*
	 * Compound-page attempts must not trigger reclaim and must not
	 * retry: on failure we simply fall back to a lower order.
	 */
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	/*
	 * Phase 1: collect pages on a private list, largest feasible
	 * order first, until nr_pages worth of memory is gathered.
	 */
	while (alloc_pages < nr_pages) {
		unsigned chunk_pages;
		int order;

		/* Never request more than the remaining deficit. */
		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				/* Order-0 may reclaim; back off and retry. */
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				/* Degrade to a smaller compound order. */
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		/* Track whether pages came from more than one NUMA node. */
		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	/* Phase 2: build the sg table; retry until it succeeds. */
	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	/* Phase 3: move each collected chunk into an sg entry. */
	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		/* Last entry may cover less than the full chunk. */
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		abd_mark_zfs_page(page);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer.  All single-page (4K) ABD's can be
		 * represented this way.  Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages).  This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization can not be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}
#else

/*
 * Allocate N individual pages to construct a scatter ABD.  This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces.  It's designed for maximum compatibility.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	/* Build the sg table first; retry until it succeeds. */
	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	/* Populate each sg entry with an individually-allocated page. */
	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		/* All allocations here are order 0 (single pages). */
		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
		abd_mark_zfs_page(page);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	struct sg_table table;

	/* Rebuild a temporary sg_table so sg_free_table() can release it. */
	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
	sg_free_table(&table);
}

/*
 * Release every page chunk of a scatter ABD, then free its sg table.
 * Also reverses the multi-zone/multi-chunk kstat accounting done at
 * allocation time.
 */
void
abd_free_chunks(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		page = sg_page(sg);
		abd_unmark_zfs_page(page);
		order = compound_order(page);
		__free_pages(page, order);
		ASSERT3U(sg->length, <=, PAGE_SIZE << order);
		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
	}
	abd_free_sg_table(abd);
}

/*
 * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 * the scatterlist will be set to the zero'd out buffer abd_zero_page.
 */
static void
abd_alloc_zero_scatter(void)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	int i = 0;

	/* The single shared zero page; retry allocation until it succeeds. */
	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
		schedule_timeout_interruptible(1);
	}
	abd_mark_zfs_page(abd_zero_page);

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}
	ASSERT3U(table.nents, ==, nr_pages);

	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;

	/* Every sg entry aliases the same zero page. */
	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	/* Only one physical page is consumed, regardless of abd_size. */
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define	PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif

/* Userland stand-ins for the kernel kmap/irq primitives (all no-ops). */
#define	zfs_kmap_atomic(chunk, km)	((void *)chunk)
#define	zfs_kunmap_atomic(addr, km)	do { (void)(addr); } while (0)
#define	local_irq_save(flags)		do { (void)(flags); } while (0)
#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
#define	nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

/* Minimal userland emulation of the Linux scatterlist. */
struct scatterlist {
	struct page *page;
	int length;
	int end;	/* non-zero marks the final entry of the table */
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	int nents = ABD_SCATTER(abd).abd_nents;
	vmem_free(ABD_SCATTER(abd).abd_sgl,
	    nents * sizeof (struct scatterlist));
}

#define	for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	/* The entry flagged 'end' terminates the table. */
	if (sg->end)
		return (NULL);

	return (sg + 1);
}

/*
 * Userland scatter allocation: one aligned PAGESIZE umem buffer per
 * scatterlist entry.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

/* Userland counterpart of the kernel abd_free_chunks(). */
void
abd_free_chunks(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		/* An entry may span several PAGESIZE umem buffers. */
		for (int j = 0; j < sg->length; j += PAGESIZE) {
			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
			umem_free(p, PAGESIZE);
		}
	}
	abd_free_sg_table(abd);
}

/*
 * Userland version of abd_alloc_zero_scatter(): a single zeroed umem
 * buffer shared by every entry of the scatterlist.
 */
static void
abd_alloc_zero_scatter(void)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	struct scatterlist *sg;
	int i;

	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
	memset(abd_zero_page, 0, PAGESIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	zfs_refcount_create(&abd_zero_scatter->abd_children);
	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);

	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	/* Only one physical page is consumed, regardless of abd_size. */
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#endif /* _KERNEL */

/* Should an allocation of 'size' bytes be linear rather than scatter? */
boolean_t
abd_size_alloc_linear(size_t size)
{
	return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE);
}

/*
 * Apply (or reverse) scatter-ABD kstat and ARC waste accounting.
 * 'waste' is the unused tail of the last PAGESIZE chunk.
 */
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

/* Apply (or reverse) linear-ABD kstat accounting. */
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) 644eda14cbcSMatt Macy { 645eda14cbcSMatt Macy ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); 646eda14cbcSMatt Macy if (op == ABDSTAT_INCR) { 647eda14cbcSMatt Macy ABDSTAT_BUMP(abdstat_linear_cnt); 648eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); 649eda14cbcSMatt Macy } else { 650eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_linear_cnt); 651eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); 652eda14cbcSMatt Macy } 653eda14cbcSMatt Macy } 654eda14cbcSMatt Macy 655eda14cbcSMatt Macy void 656eda14cbcSMatt Macy abd_verify_scatter(abd_t *abd) 657eda14cbcSMatt Macy { 658eda14cbcSMatt Macy size_t n; 659eda14cbcSMatt Macy int i = 0; 660eda14cbcSMatt Macy struct scatterlist *sg = NULL; 661eda14cbcSMatt Macy 662eda14cbcSMatt Macy ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); 663eda14cbcSMatt Macy ASSERT3U(ABD_SCATTER(abd).abd_offset, <, 664eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl->length); 665eda14cbcSMatt Macy n = ABD_SCATTER(abd).abd_nents; 666eda14cbcSMatt Macy abd_for_each_sg(abd, sg, n, i) { 667eda14cbcSMatt Macy ASSERT3P(sg_page(sg), !=, NULL); 668eda14cbcSMatt Macy } 669eda14cbcSMatt Macy } 670eda14cbcSMatt Macy 671eda14cbcSMatt Macy static void 672eda14cbcSMatt Macy abd_free_zero_scatter(void) 673eda14cbcSMatt Macy { 674eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); 675eda14cbcSMatt Macy ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE); 676eda14cbcSMatt Macy ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); 677eda14cbcSMatt Macy 678eda14cbcSMatt Macy abd_free_sg_table(abd_zero_scatter); 679eda14cbcSMatt Macy abd_free_struct(abd_zero_scatter); 680eda14cbcSMatt Macy abd_zero_scatter = NULL; 681eda14cbcSMatt Macy ASSERT3P(abd_zero_page, !=, NULL); 682eda14cbcSMatt Macy #if defined(_KERNEL) 683eda14cbcSMatt Macy abd_unmark_zfs_page(abd_zero_page); 684eda14cbcSMatt Macy __free_page(abd_zero_page); 685eda14cbcSMatt Macy #else 686eda14cbcSMatt 
Macy umem_free(abd_zero_page, PAGESIZE); 687eda14cbcSMatt Macy #endif /* _KERNEL */ 688eda14cbcSMatt Macy } 689eda14cbcSMatt Macy 690eda14cbcSMatt Macy void 691eda14cbcSMatt Macy abd_init(void) 692eda14cbcSMatt Macy { 693eda14cbcSMatt Macy int i; 694eda14cbcSMatt Macy 695eda14cbcSMatt Macy abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), 696eda14cbcSMatt Macy 0, NULL, NULL, NULL, NULL, NULL, 0); 697eda14cbcSMatt Macy 698eda14cbcSMatt Macy abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, 699eda14cbcSMatt Macy sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 700eda14cbcSMatt Macy if (abd_ksp != NULL) { 701eda14cbcSMatt Macy for (i = 0; i < MAX_ORDER; i++) { 702eda14cbcSMatt Macy snprintf(abd_stats.abdstat_scatter_orders[i].name, 703eda14cbcSMatt Macy KSTAT_STRLEN, "scatter_order_%d", i); 704eda14cbcSMatt Macy abd_stats.abdstat_scatter_orders[i].data_type = 705eda14cbcSMatt Macy KSTAT_DATA_UINT64; 706eda14cbcSMatt Macy } 707eda14cbcSMatt Macy abd_ksp->ks_data = &abd_stats; 708eda14cbcSMatt Macy kstat_install(abd_ksp); 709eda14cbcSMatt Macy } 710eda14cbcSMatt Macy 711eda14cbcSMatt Macy abd_alloc_zero_scatter(); 712eda14cbcSMatt Macy } 713eda14cbcSMatt Macy 714eda14cbcSMatt Macy void 715eda14cbcSMatt Macy abd_fini(void) 716eda14cbcSMatt Macy { 717eda14cbcSMatt Macy abd_free_zero_scatter(); 718eda14cbcSMatt Macy 719eda14cbcSMatt Macy if (abd_ksp != NULL) { 720eda14cbcSMatt Macy kstat_delete(abd_ksp); 721eda14cbcSMatt Macy abd_ksp = NULL; 722eda14cbcSMatt Macy } 723eda14cbcSMatt Macy 724eda14cbcSMatt Macy if (abd_cache) { 725eda14cbcSMatt Macy kmem_cache_destroy(abd_cache); 726eda14cbcSMatt Macy abd_cache = NULL; 727eda14cbcSMatt Macy } 728eda14cbcSMatt Macy } 729eda14cbcSMatt Macy 730eda14cbcSMatt Macy void 731eda14cbcSMatt Macy abd_free_linear_page(abd_t *abd) 732eda14cbcSMatt Macy { 733eda14cbcSMatt Macy /* Transform it back into a scatter ABD for freeing */ 734eda14cbcSMatt Macy struct scatterlist *sg = 
abd->abd_u.abd_linear.abd_sgl; 735eda14cbcSMatt Macy abd->abd_flags &= ~ABD_FLAG_LINEAR; 736eda14cbcSMatt Macy abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; 737eda14cbcSMatt Macy ABD_SCATTER(abd).abd_nents = 1; 738eda14cbcSMatt Macy ABD_SCATTER(abd).abd_offset = 0; 739eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl = sg; 740eda14cbcSMatt Macy abd_free_chunks(abd); 741eda14cbcSMatt Macy 742eda14cbcSMatt Macy abd_update_scatter_stats(abd, ABDSTAT_DECR); 743eda14cbcSMatt Macy } 744eda14cbcSMatt Macy 745eda14cbcSMatt Macy /* 746eda14cbcSMatt Macy * If we're going to use this ABD for doing I/O using the block layer, the 747eda14cbcSMatt Macy * consumer of the ABD data doesn't care if it's scattered or not, and we don't 748eda14cbcSMatt Macy * plan to store this ABD in memory for a long period of time, we should 749eda14cbcSMatt Macy * allocate the ABD type that requires the least data copying to do the I/O. 750eda14cbcSMatt Macy * 751eda14cbcSMatt Macy * On Linux the optimal thing to do would be to use abd_get_offset() and 752eda14cbcSMatt Macy * construct a new ABD which shares the original pages thereby eliminating 753eda14cbcSMatt Macy * the copy. But for the moment a new linear ABD is allocated until this 754eda14cbcSMatt Macy * performance optimization can be implemented. 
755eda14cbcSMatt Macy */ 756eda14cbcSMatt Macy abd_t * 757eda14cbcSMatt Macy abd_alloc_for_io(size_t size, boolean_t is_metadata) 758eda14cbcSMatt Macy { 759eda14cbcSMatt Macy return (abd_alloc(size, is_metadata)); 760eda14cbcSMatt Macy } 761eda14cbcSMatt Macy 762eda14cbcSMatt Macy abd_t * 763184c1b94SMartin Matuska abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off) 764eda14cbcSMatt Macy { 765eda14cbcSMatt Macy int i = 0; 766eda14cbcSMatt Macy struct scatterlist *sg = NULL; 767eda14cbcSMatt Macy 768eda14cbcSMatt Macy abd_verify(sabd); 769eda14cbcSMatt Macy ASSERT3U(off, <=, sabd->abd_size); 770eda14cbcSMatt Macy 771eda14cbcSMatt Macy size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; 772eda14cbcSMatt Macy 773184c1b94SMartin Matuska if (abd == NULL) 774eda14cbcSMatt Macy abd = abd_alloc_struct(0); 775eda14cbcSMatt Macy 776eda14cbcSMatt Macy /* 777eda14cbcSMatt Macy * Even if this buf is filesystem metadata, we only track that 778eda14cbcSMatt Macy * if we own the underlying data buffer, which is not true in 779eda14cbcSMatt Macy * this case. Therefore, we don't ever use ABD_FLAG_META here. 780eda14cbcSMatt Macy */ 781eda14cbcSMatt Macy 782eda14cbcSMatt Macy abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { 783eda14cbcSMatt Macy if (new_offset < sg->length) 784eda14cbcSMatt Macy break; 785eda14cbcSMatt Macy new_offset -= sg->length; 786eda14cbcSMatt Macy } 787eda14cbcSMatt Macy 788eda14cbcSMatt Macy ABD_SCATTER(abd).abd_sgl = sg; 789eda14cbcSMatt Macy ABD_SCATTER(abd).abd_offset = new_offset; 790eda14cbcSMatt Macy ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; 791eda14cbcSMatt Macy 792eda14cbcSMatt Macy return (abd); 793eda14cbcSMatt Macy } 794eda14cbcSMatt Macy 795eda14cbcSMatt Macy /* 796eda14cbcSMatt Macy * Initialize the abd_iter. 
797eda14cbcSMatt Macy */ 798eda14cbcSMatt Macy void 799eda14cbcSMatt Macy abd_iter_init(struct abd_iter *aiter, abd_t *abd) 800eda14cbcSMatt Macy { 801eda14cbcSMatt Macy ASSERT(!abd_is_gang(abd)); 802eda14cbcSMatt Macy abd_verify(abd); 803eda14cbcSMatt Macy aiter->iter_abd = abd; 804eda14cbcSMatt Macy aiter->iter_mapaddr = NULL; 805eda14cbcSMatt Macy aiter->iter_mapsize = 0; 806eda14cbcSMatt Macy aiter->iter_pos = 0; 807eda14cbcSMatt Macy if (abd_is_linear(abd)) { 808eda14cbcSMatt Macy aiter->iter_offset = 0; 809eda14cbcSMatt Macy aiter->iter_sg = NULL; 810eda14cbcSMatt Macy } else { 811eda14cbcSMatt Macy aiter->iter_offset = ABD_SCATTER(abd).abd_offset; 812eda14cbcSMatt Macy aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; 813eda14cbcSMatt Macy } 814eda14cbcSMatt Macy } 815eda14cbcSMatt Macy 816eda14cbcSMatt Macy /* 817eda14cbcSMatt Macy * This is just a helper function to see if we have exhausted the 818eda14cbcSMatt Macy * abd_iter and reached the end. 819eda14cbcSMatt Macy */ 820eda14cbcSMatt Macy boolean_t 821eda14cbcSMatt Macy abd_iter_at_end(struct abd_iter *aiter) 822eda14cbcSMatt Macy { 823eda14cbcSMatt Macy return (aiter->iter_pos == aiter->iter_abd->abd_size); 824eda14cbcSMatt Macy } 825eda14cbcSMatt Macy 826eda14cbcSMatt Macy /* 827eda14cbcSMatt Macy * Advance the iterator by a certain amount. Cannot be called when a chunk is 828eda14cbcSMatt Macy * in use. This can be safely called when the aiter has already exhausted, in 829eda14cbcSMatt Macy * which case this does nothing. 
830eda14cbcSMatt Macy */ 831eda14cbcSMatt Macy void 832eda14cbcSMatt Macy abd_iter_advance(struct abd_iter *aiter, size_t amount) 833eda14cbcSMatt Macy { 834eda14cbcSMatt Macy ASSERT3P(aiter->iter_mapaddr, ==, NULL); 835eda14cbcSMatt Macy ASSERT0(aiter->iter_mapsize); 836eda14cbcSMatt Macy 837eda14cbcSMatt Macy /* There's nothing left to advance to, so do nothing */ 838eda14cbcSMatt Macy if (abd_iter_at_end(aiter)) 839eda14cbcSMatt Macy return; 840eda14cbcSMatt Macy 841eda14cbcSMatt Macy aiter->iter_pos += amount; 842eda14cbcSMatt Macy aiter->iter_offset += amount; 843eda14cbcSMatt Macy if (!abd_is_linear(aiter->iter_abd)) { 844eda14cbcSMatt Macy while (aiter->iter_offset >= aiter->iter_sg->length) { 845eda14cbcSMatt Macy aiter->iter_offset -= aiter->iter_sg->length; 846eda14cbcSMatt Macy aiter->iter_sg = sg_next(aiter->iter_sg); 847eda14cbcSMatt Macy if (aiter->iter_sg == NULL) { 848eda14cbcSMatt Macy ASSERT0(aiter->iter_offset); 849eda14cbcSMatt Macy break; 850eda14cbcSMatt Macy } 851eda14cbcSMatt Macy } 852eda14cbcSMatt Macy } 853eda14cbcSMatt Macy } 854eda14cbcSMatt Macy 855eda14cbcSMatt Macy /* 856eda14cbcSMatt Macy * Map the current chunk into aiter. This can be safely called when the aiter 857eda14cbcSMatt Macy * has already exhausted, in which case this does nothing. 
858eda14cbcSMatt Macy */ 859eda14cbcSMatt Macy void 860eda14cbcSMatt Macy abd_iter_map(struct abd_iter *aiter) 861eda14cbcSMatt Macy { 862eda14cbcSMatt Macy void *paddr; 863eda14cbcSMatt Macy size_t offset = 0; 864eda14cbcSMatt Macy 865eda14cbcSMatt Macy ASSERT3P(aiter->iter_mapaddr, ==, NULL); 866eda14cbcSMatt Macy ASSERT0(aiter->iter_mapsize); 867eda14cbcSMatt Macy 868eda14cbcSMatt Macy /* There's nothing left to iterate over, so do nothing */ 869eda14cbcSMatt Macy if (abd_iter_at_end(aiter)) 870eda14cbcSMatt Macy return; 871eda14cbcSMatt Macy 872eda14cbcSMatt Macy if (abd_is_linear(aiter->iter_abd)) { 873eda14cbcSMatt Macy ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); 874eda14cbcSMatt Macy offset = aiter->iter_offset; 875eda14cbcSMatt Macy aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; 876eda14cbcSMatt Macy paddr = ABD_LINEAR_BUF(aiter->iter_abd); 877eda14cbcSMatt Macy } else { 878eda14cbcSMatt Macy offset = aiter->iter_offset; 879eda14cbcSMatt Macy aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, 880eda14cbcSMatt Macy aiter->iter_abd->abd_size - aiter->iter_pos); 881eda14cbcSMatt Macy 882eda14cbcSMatt Macy paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), 883eda14cbcSMatt Macy km_table[aiter->iter_km]); 884eda14cbcSMatt Macy } 885eda14cbcSMatt Macy 886eda14cbcSMatt Macy aiter->iter_mapaddr = (char *)paddr + offset; 887eda14cbcSMatt Macy } 888eda14cbcSMatt Macy 889eda14cbcSMatt Macy /* 890eda14cbcSMatt Macy * Unmap the current chunk from aiter. This can be safely called when the aiter 891eda14cbcSMatt Macy * has already exhausted, in which case this does nothing. 
892eda14cbcSMatt Macy */ 893eda14cbcSMatt Macy void 894eda14cbcSMatt Macy abd_iter_unmap(struct abd_iter *aiter) 895eda14cbcSMatt Macy { 896eda14cbcSMatt Macy /* There's nothing left to unmap, so do nothing */ 897eda14cbcSMatt Macy if (abd_iter_at_end(aiter)) 898eda14cbcSMatt Macy return; 899eda14cbcSMatt Macy 900eda14cbcSMatt Macy if (!abd_is_linear(aiter->iter_abd)) { 901eda14cbcSMatt Macy /* LINTED E_FUNC_SET_NOT_USED */ 902eda14cbcSMatt Macy zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, 903eda14cbcSMatt Macy km_table[aiter->iter_km]); 904eda14cbcSMatt Macy } 905eda14cbcSMatt Macy 906eda14cbcSMatt Macy ASSERT3P(aiter->iter_mapaddr, !=, NULL); 907eda14cbcSMatt Macy ASSERT3U(aiter->iter_mapsize, >, 0); 908eda14cbcSMatt Macy 909eda14cbcSMatt Macy aiter->iter_mapaddr = NULL; 910eda14cbcSMatt Macy aiter->iter_mapsize = 0; 911eda14cbcSMatt Macy } 912eda14cbcSMatt Macy 913eda14cbcSMatt Macy void 914eda14cbcSMatt Macy abd_cache_reap_now(void) 915eda14cbcSMatt Macy { 916eda14cbcSMatt Macy } 917eda14cbcSMatt Macy 918eda14cbcSMatt Macy #if defined(_KERNEL) 919eda14cbcSMatt Macy /* 920eda14cbcSMatt Macy * bio_nr_pages for ABD. 
921eda14cbcSMatt Macy * @off is the offset in @abd 922eda14cbcSMatt Macy */ 923eda14cbcSMatt Macy unsigned long 924eda14cbcSMatt Macy abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) 925eda14cbcSMatt Macy { 926eda14cbcSMatt Macy unsigned long pos; 927eda14cbcSMatt Macy 928184c1b94SMartin Matuska if (abd_is_gang(abd)) { 929184c1b94SMartin Matuska unsigned long count = 0; 930eda14cbcSMatt Macy 931184c1b94SMartin Matuska for (abd_t *cabd = abd_gang_get_offset(abd, &off); 932184c1b94SMartin Matuska cabd != NULL && size != 0; 933184c1b94SMartin Matuska cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { 934184c1b94SMartin Matuska ASSERT3U(off, <, cabd->abd_size); 935184c1b94SMartin Matuska int mysize = MIN(size, cabd->abd_size - off); 936184c1b94SMartin Matuska count += abd_nr_pages_off(cabd, mysize, off); 937184c1b94SMartin Matuska size -= mysize; 938184c1b94SMartin Matuska off = 0; 939184c1b94SMartin Matuska } 940184c1b94SMartin Matuska return (count); 941184c1b94SMartin Matuska } 942184c1b94SMartin Matuska 943eda14cbcSMatt Macy if (abd_is_linear(abd)) 944eda14cbcSMatt Macy pos = (unsigned long)abd_to_buf(abd) + off; 945eda14cbcSMatt Macy else 946eda14cbcSMatt Macy pos = ABD_SCATTER(abd).abd_offset + off; 947eda14cbcSMatt Macy 948184c1b94SMartin Matuska return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - 949184c1b94SMartin Matuska (pos >> PAGE_SHIFT)); 950eda14cbcSMatt Macy } 951eda14cbcSMatt Macy 952eda14cbcSMatt Macy static unsigned int 953eda14cbcSMatt Macy bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size) 954eda14cbcSMatt Macy { 955eda14cbcSMatt Macy unsigned int offset, size, i; 956eda14cbcSMatt Macy struct page *page; 957eda14cbcSMatt Macy 958eda14cbcSMatt Macy offset = offset_in_page(buf_ptr); 959eda14cbcSMatt Macy for (i = 0; i < bio->bi_max_vecs; i++) { 960eda14cbcSMatt Macy size = PAGE_SIZE - offset; 961eda14cbcSMatt Macy 962eda14cbcSMatt Macy if (bio_size <= 0) 963eda14cbcSMatt Macy break; 964eda14cbcSMatt Macy 
965eda14cbcSMatt Macy if (size > bio_size) 966eda14cbcSMatt Macy size = bio_size; 967eda14cbcSMatt Macy 968eda14cbcSMatt Macy if (is_vmalloc_addr(buf_ptr)) 969eda14cbcSMatt Macy page = vmalloc_to_page(buf_ptr); 970eda14cbcSMatt Macy else 971eda14cbcSMatt Macy page = virt_to_page(buf_ptr); 972eda14cbcSMatt Macy 973eda14cbcSMatt Macy /* 974eda14cbcSMatt Macy * Some network related block device uses tcp_sendpage, which 975eda14cbcSMatt Macy * doesn't behave well when using 0-count page, this is a 976eda14cbcSMatt Macy * safety net to catch them. 977eda14cbcSMatt Macy */ 978eda14cbcSMatt Macy ASSERT3S(page_count(page), >, 0); 979eda14cbcSMatt Macy 980eda14cbcSMatt Macy if (bio_add_page(bio, page, size, offset) != size) 981eda14cbcSMatt Macy break; 982eda14cbcSMatt Macy 983eda14cbcSMatt Macy buf_ptr += size; 984eda14cbcSMatt Macy bio_size -= size; 985eda14cbcSMatt Macy offset = 0; 986eda14cbcSMatt Macy } 987eda14cbcSMatt Macy 988eda14cbcSMatt Macy return (bio_size); 989eda14cbcSMatt Macy } 990eda14cbcSMatt Macy 991eda14cbcSMatt Macy /* 992eda14cbcSMatt Macy * bio_map for gang ABD. 
993eda14cbcSMatt Macy */ 994eda14cbcSMatt Macy static unsigned int 995eda14cbcSMatt Macy abd_gang_bio_map_off(struct bio *bio, abd_t *abd, 996eda14cbcSMatt Macy unsigned int io_size, size_t off) 997eda14cbcSMatt Macy { 998eda14cbcSMatt Macy ASSERT(abd_is_gang(abd)); 999eda14cbcSMatt Macy 1000eda14cbcSMatt Macy for (abd_t *cabd = abd_gang_get_offset(abd, &off); 1001eda14cbcSMatt Macy cabd != NULL; 1002eda14cbcSMatt Macy cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { 1003eda14cbcSMatt Macy ASSERT3U(off, <, cabd->abd_size); 1004eda14cbcSMatt Macy int size = MIN(io_size, cabd->abd_size - off); 1005eda14cbcSMatt Macy int remainder = abd_bio_map_off(bio, cabd, size, off); 1006eda14cbcSMatt Macy io_size -= (size - remainder); 1007eda14cbcSMatt Macy if (io_size == 0 || remainder > 0) 1008eda14cbcSMatt Macy return (io_size); 1009eda14cbcSMatt Macy off = 0; 1010eda14cbcSMatt Macy } 1011eda14cbcSMatt Macy ASSERT0(io_size); 1012eda14cbcSMatt Macy return (io_size); 1013eda14cbcSMatt Macy } 1014eda14cbcSMatt Macy 1015eda14cbcSMatt Macy /* 1016eda14cbcSMatt Macy * bio_map for ABD. 
1017eda14cbcSMatt Macy * @off is the offset in @abd 1018eda14cbcSMatt Macy * Remaining IO size is returned 1019eda14cbcSMatt Macy */ 1020eda14cbcSMatt Macy unsigned int 1021eda14cbcSMatt Macy abd_bio_map_off(struct bio *bio, abd_t *abd, 1022eda14cbcSMatt Macy unsigned int io_size, size_t off) 1023eda14cbcSMatt Macy { 1024eda14cbcSMatt Macy struct abd_iter aiter; 1025eda14cbcSMatt Macy 1026eda14cbcSMatt Macy ASSERT3U(io_size, <=, abd->abd_size - off); 1027eda14cbcSMatt Macy if (abd_is_linear(abd)) 1028eda14cbcSMatt Macy return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size)); 1029eda14cbcSMatt Macy 1030eda14cbcSMatt Macy ASSERT(!abd_is_linear(abd)); 1031eda14cbcSMatt Macy if (abd_is_gang(abd)) 1032eda14cbcSMatt Macy return (abd_gang_bio_map_off(bio, abd, io_size, off)); 1033eda14cbcSMatt Macy 1034eda14cbcSMatt Macy abd_iter_init(&aiter, abd); 1035eda14cbcSMatt Macy abd_iter_advance(&aiter, off); 1036eda14cbcSMatt Macy 1037184c1b94SMartin Matuska for (int i = 0; i < bio->bi_max_vecs; i++) { 1038eda14cbcSMatt Macy struct page *pg; 1039eda14cbcSMatt Macy size_t len, sgoff, pgoff; 1040eda14cbcSMatt Macy struct scatterlist *sg; 1041eda14cbcSMatt Macy 1042eda14cbcSMatt Macy if (io_size <= 0) 1043eda14cbcSMatt Macy break; 1044eda14cbcSMatt Macy 1045eda14cbcSMatt Macy sg = aiter.iter_sg; 1046eda14cbcSMatt Macy sgoff = aiter.iter_offset; 1047eda14cbcSMatt Macy pgoff = sgoff & (PAGESIZE - 1); 1048eda14cbcSMatt Macy len = MIN(io_size, PAGESIZE - pgoff); 1049eda14cbcSMatt Macy ASSERT(len > 0); 1050eda14cbcSMatt Macy 1051eda14cbcSMatt Macy pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); 1052eda14cbcSMatt Macy if (bio_add_page(bio, pg, len, pgoff) != len) 1053eda14cbcSMatt Macy break; 1054eda14cbcSMatt Macy 1055eda14cbcSMatt Macy io_size -= len; 1056eda14cbcSMatt Macy abd_iter_advance(&aiter, len); 1057eda14cbcSMatt Macy } 1058eda14cbcSMatt Macy 1059eda14cbcSMatt Macy return (io_size); 1060eda14cbcSMatt Macy } 1061eda14cbcSMatt Macy 1062eda14cbcSMatt Macy /* Tunable 
Parameters */ 1063eda14cbcSMatt Macy module_param(zfs_abd_scatter_enabled, int, 0644); 1064eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_enabled, 1065eda14cbcSMatt Macy "Toggle whether ABD allocations must be linear."); 1066eda14cbcSMatt Macy module_param(zfs_abd_scatter_min_size, int, 0644); 1067eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_min_size, 1068eda14cbcSMatt Macy "Minimum size of scatter allocations."); 1069eda14cbcSMatt Macy /* CSTYLED */ 1070eda14cbcSMatt Macy module_param(zfs_abd_scatter_max_order, uint, 0644); 1071eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_max_order, 1072eda14cbcSMatt Macy "Maximum order allocation used for a scatter ABD."); 1073eda14cbcSMatt Macy #endif 1074