1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
23  */
24 
25 #ifndef _SYS_BRT_IMPL_H
26 #define	_SYS_BRT_IMPL_H
27 
28 #ifdef	__cplusplus
29 extern "C" {
30 #endif
31 
/*
 * BRT - Block Reference Table.
 */

/*
 * String prefix used when naming BRT objects.
 * NOTE(review): presumably a VDEV id is appended to this prefix to build the
 * name of each per-VDEV BRT object - confirm against the users of this macro.
 * The string is presumably stored in the pool, so it must not be changed.
 */
#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
36 
/*
 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
 * by a 16bit counter, thus a 1TB VDEV requires 128kB of memory:
 * (1TB / 16MB) * 2B. Each element in this array of counters tells how many
 * BRT entries we have in the corresponding chunk of storage. We always load
 * this entire array into memory and update it as needed. By having it in
 * memory we can quickly tell (during zio_free()) if there are any BRT entries
 * that we might need to update.
 *
 * This value cannot be larger than 16MB, at least as long as we support
 * 512 byte block sizes. With 512 byte block size we can have exactly
 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
 * many for a 16bit counter.
 */
#define	BRT_RANGESIZE	(16 * 1024 * 1024)
/*
 * Compile-time enforcement of the limit described above: the number of
 * minimum-sized blocks in a single range has to fit into a 16bit counter.
 */
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
	"BRT_RANGESIZE is too large.");
53 /*
 * We don't want to update the whole structure every time. Maintain a bitmap
 * of dirty blocks within the regions, so that a single bit represents one
 * disk block worth of entcounts. For example, if we have a 1PB vdev then all
 * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
58  * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
59  * the whole 128MB on disk when we have updated only a single entcount.
60  * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
61  * is represented by a single bit. This gives us 4096 bits. A set bit in the
62  * bitmap means that we had a change in at least one of the 16384 entcounts
63  * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
64  */
#define	BRT_BLOCKSIZE	(32 * 1024)
/*
 * Number of BRT_BLOCKSIZE disk blocks (and therefore bitmap bits) needed to
 * cover "size" entcounts, where "size" is taken to be the number of elements
 * in the entcount array. A single block holds
 * BRT_BLOCKSIZE / sizeof (uint16_t) entcounts, so the result is "size"
 * divided by that number and rounded up. For example 65536 entcounts (128kB)
 * need 4 blocks and 64M entcounts (128MB) need 4096 blocks.
 *
 * The divisor has to be the number of entcounts per block. Dividing by
 * BRT_BLOCKSIZE and then by sizeof (uint16_t) again would make one bit cover
 * four times as many entcounts as fit into a block and undercount the blocks.
 *
 * "size" must be at least 1. The argument is evaluated exactly once.
 */
#define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
	(((size) - 1) / (BRT_BLOCKSIZE / sizeof (uint16_t)) + 1)
68 
/*
 * Byte order identifiers. BRT_NATIVE_BYTEORDER is the identifier that matches
 * the build target, as selected by _ZFS_LITTLE_ENDIAN, and
 * BRT_NON_NATIVE_BYTEORDER is the opposite one.
 */
#define	BRT_LITTLE_ENDIAN	0
#define	BRT_BIG_ENDIAN		1
#if !defined(_ZFS_LITTLE_ENDIAN)
#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
#else	/* _ZFS_LITTLE_ENDIAN */
#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
#endif	/* !_ZFS_LITTLE_ENDIAN */
78 
/*
 * Per-VDEV BRT metadata in the form that is kept on disk (brt_vdev_t refers
 * to it as the structure that "needs updating on disk").
 * All fields are uint64_t, so the structure contains no padding. As this is
 * an on-disk layout, fields must not be reordered, resized or removed.
 *
 * NOTE(review): each bvp_* field appears to be the stored counterpart of the
 * brt_vdev_t field with the same suffix - confirm against the code that
 * reads and writes this structure.
 */
typedef struct brt_vdev_phys {
	/* Presumably the MOS object number of the entries table. */
	uint64_t	bvp_mos_entries;
	/* Presumably the number of entries in the entcount array. */
	uint64_t	bvp_size;
	/* Presumably BRT_LITTLE_ENDIAN or BRT_BIG_ENDIAN - TODO confirm. */
	uint64_t	bvp_byteorder;
	/* Presumably the sum of all entcounts. */
	uint64_t	bvp_totalcount;
	/* Presumably the range size in use (see BRT_RANGESIZE). */
	uint64_t	bvp_rangesize;
	/* Presumably the stored value of bv_usedspace. */
	uint64_t	bvp_usedspace;
	/* Presumably the stored value of bv_savedspace. */
	uint64_t	bvp_savedspace;
} brt_vdev_phys_t;
88 
/*
 * Per-VDEV BRT state: the entcount array, the dirty-block bitmap that tracks
 * changes to it, the tree of entries to sync and the space accounting.
 */
typedef struct brt_vdev {
	/*
	 * VDEV id.
	 */
	uint64_t	bv_vdevid;
	/*
	 * Is the structure initiated?
	 * (Are bv_entcount and bv_bitmap allocated?)
	 */
	boolean_t	bv_initiated;
	/*
	 * Object number in the MOS for the entcount array and brt_vdev_phys.
	 */
	uint64_t	bv_mos_brtvdev;
	/*
	 * Object number in the MOS for the entries table.
	 */
	uint64_t	bv_mos_entries;
	/*
	 * Entries to sync.
	 */
	avl_tree_t	bv_tree;
	/*
	 * Does the bv_entcount[] array need byte swapping?
	 */
	boolean_t	bv_need_byteswap;
	/*
	 * Number of entries in the bv_entcount[] array.
	 */
	uint64_t	bv_size;
	/*
	 * This is the array with BRT entry count per BRT_RANGESIZE.
	 */
	uint16_t	*bv_entcount;
	/*
	 * Sum of all bv_entcount[]s.
	 */
	uint64_t	bv_totalcount;
	/*
	 * Space on disk occupied by cloned blocks (without compression).
	 */
	uint64_t	bv_usedspace;
	/*
	 * How much additional space would be occupied without block cloning.
	 */
	uint64_t	bv_savedspace;
	/*
	 * brt_vdev_phys needs updating on disk.
	 */
	boolean_t	bv_meta_dirty;
	/*
	 * bv_entcount[] needs updating on disk.
	 */
	boolean_t	bv_entcount_dirty;
	/*
	 * bv_entcount[] can potentially be a bit too big to synchronize it all
	 * when we have changed just a few entcounts. The fields below allow us
	 * to track updates to the bv_entcount[] array since the last sync.
	 * A single bit in the bv_bitmap represents as many entcounts as can
	 * fit into a single BRT_BLOCKSIZE.
	 * For example we have 65536 entcounts in the bv_entcount array
	 * (so the whole array is 128kB). We updated bv_entcount[2] and
	 * bv_entcount[5]. In that case only the first bit in the bv_bitmap
	 * will be set and we will write only the first BRT_BLOCKSIZE out of
	 * 128kB.
	 */
	ulong_t		*bv_bitmap;
	/*
	 * Number of blocks tracked by bv_bitmap.
	 * NOTE(review): presumably BRT_RANGESIZE_TO_NBLOCKS(bv_size) - confirm
	 * against the code that allocates bv_bitmap.
	 */
	uint64_t	bv_nblocks;
} brt_vdev_t;
157 
/*
 * In-core brt
 *
 * Top-level BRT state: it references the pool (brt_spa), holds the array of
 * per-VDEV states (brt_vdevs) and the pending trees, one per TXG_SIZE slot.
 */
typedef struct brt {
	/*
	 * Reader/writer lock.
	 * NOTE(review): presumably protects the fields of this structure -
	 * confirm the exact coverage against the code that takes it.
	 */
	krwlock_t	brt_lock;
	/*
	 * The pool this BRT is attached to.
	 */
	spa_t		*brt_spa;
	/*
	 * Shorthand for the pool's meta objset, usable as brt->brt_mos.
	 * Being a macro it is not scoped to this structure: every later
	 * occurrence of the identifier brt_mos is expanded.
	 */
#define	brt_mos		brt_spa->spa_meta_objset
	/*
	 * NOTE(review): presumably the range size in use (see BRT_RANGESIZE)
	 * and the pool-wide totals of the per-VDEV bv_usedspace and
	 * bv_savedspace - confirm.
	 */
	uint64_t	brt_rangesize;
	uint64_t	brt_usedspace;
	uint64_t	brt_savedspace;
	/*
	 * Pending trees and their locks, TXG_SIZE of each.
	 * NOTE(review): presumably brt_pending_lock[i] protects
	 * brt_pending_tree[i] and the trees hold brt_pending_entry_t -
	 * confirm.
	 */
	avl_tree_t	brt_pending_tree[TXG_SIZE];
	kmutex_t	brt_pending_lock[TXG_SIZE];
	/* Sum of all entries across all bv_trees. */
	uint64_t	brt_nentries;
	/*
	 * Per-VDEV states.
	 * NOTE(review): presumably an array of brt_nvdevs elements - confirm.
	 */
	brt_vdev_t	*brt_vdevs;
	uint64_t	brt_nvdevs;
} brt_t;
175 
/*
 * Length of the key (bre_offset) in 64-bit words:
 * sizeof (bre_offset) / sizeof (uint64_t).
 */
#define	BRT_KEY_WORDS	(1)
178 
/*
 * In-core brt entry.
 * On-disk we use bre_offset as the key and bre_refcount as the value.
 */
typedef struct brt_entry {
	/* Key of the entry (BRT_KEY_WORDS 64-bit words long). */
	uint64_t	bre_offset;
	/* Value of the entry. */
	uint64_t	bre_refcount;
	/*
	 * AVL tree linkage.
	 * NOTE(review): presumably links the entry into bv_tree - confirm.
	 */
	avl_node_t	bre_node;
} brt_entry_t;
188 
/*
 * Pending brt entry: a block pointer together with a count.
 * NOTE(review): presumably these are the elements of brt_pending_tree[] and
 * bpe_count is the number of pending references to bpe_bp - confirm against
 * the code that inserts and processes them.
 */
typedef struct brt_pending_entry {
	/* Block pointer this entry is about. */
	blkptr_t	bpe_bp;
	/* Count associated with bpe_bp (signed int). */
	int		bpe_count;
	/* AVL tree linkage. */
	avl_node_t	bpe_node;
} brt_pending_entry_t;
194 
195 #ifdef	__cplusplus
196 }
197 #endif
198 
199 #endif	/* _SYS_BRT_IMPL_H */
200