/*
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023 Tomohiro Kusumi <tkusumi@netbsd.org>
 * Copyright (c) 2011-2023 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
*/

#include "hammer2.h"

/*
 * breadth-first search
 */
typedef struct hammer2_chain_save {
	TAILQ_ENTRY(hammer2_chain_save)	entry;
	hammer2_chain_t	*chain;
} hammer2_chain_save_t;

TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save);
typedef struct hammer2_chain_save_list hammer2_chain_save_list_t;
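
/*
 * Chains whose recursion must be deferred during a scan (depth limit hit
 * or too many chains already saved) are queued on this list and re-scanned
 * later; see hammer2_bulkfree_scan().
 */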

typedef struct hammer2_bulkfree_info {
	hammer2_dev_t		*hmp;
	//kmem_anon_desc_t	kp;
	hammer2_off_t		sbase;		/* sub-loop iteration */
	hammer2_off_t		sstop;
	hammer2_bmap_data_t	*bmap;
	int			depth;
	long			count_10_00;	/* staged->free	     */
	long			count_11_10;	/* allocated->staged */
	long			count_00_11;	/* (should not happen) */
	long			count_01_11;	/* (should not happen) */
	long			count_10_11;	/* staged->allocated */
	long			count_l0cleans;
	long			count_linadjusts;
	long			count_inodes_scanned;
	long			count_dirents_scanned;
	long			count_dedup_factor;
	long			count_bytes_scanned;
	long			count_chains_scanned;
	long			count_chains_reported;
	long			bulkfree_calls;
	int			bulkfree_ticks;
	int			list_alert;
	hammer2_off_t		adj_free;
	hammer2_tid_t		mtid;
	time_t			save_time;
	hammer2_chain_save_list_t list;
	long			list_count;
	long			list_count_max;
	hammer2_chain_save_t	*backout;	/* ins pt while backing out */
	hammer2_dedup_t		*dedup;
	int			pri;
} hammer2_bulkfree_info_t;

static int h2_bulkfree_test(hammer2_bulkfree_info_t *info,
			hammer2_blockref_t *bref, int pri, int saved_error);
static uint32_t bigmask_get(hammer2_bmap_data_t *bmap);
static int bigmask_good(hammer2_bmap_data_t *bmap, uint32_t live_bigmask);

/*
 * General bulk scan function with callback.  Called with a referenced
 * but UNLOCKED parent.  The parent is returned in the same state.
 */
static
int
hammer2_bulkfree_scan(hammer2_chain_t *parent,
		  int (*func)(hammer2_bulkfree_info_t *info,
			      hammer2_blockref_t *bref),
		  hammer2_bulkfree_info_t *info)
{
	hammer2_blockref_t bref;
	hammer2_chain_t *chain;
	hammer2_chain_save_t *tail;
	hammer2_chain_save_t *save;
	int first = 1;
	int rup_error;
	int error;
	int e2;

	++info->pri;

	chain = NULL;
	rup_error = 0;
	error = 0;

	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
				   HAMMER2_RESOLVE_SHARED);

	/*
	 * End of scan if parent is a PFS
	 */
	tail = TAILQ_FIRST(&info->list);

	/*
	 * The parent was previously retrieved NODATA and thus has not
	 * tested the CRC.  Now that we have locked it normally, check
	 * for a CRC problem and skip it if we found one.  The bulk scan
	 * cannot safely traverse invalid block tables (we could end up
	 * in an endless loop or cause a panic).
	 */
	if (parent->error & HAMMER2_ERROR_CHECK) {
		error = parent->error;
		goto done;
	}

	/*
	 * Report which PFS is being scanned
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    (parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT)) {
		kprintf("hammer2_bulkfree: Scanning %s\n",
			parent->data->ipdata.filename);
	}

	/*
	 * Generally loop on the contents if we have not been flagged
	 * for abort.
	 *
	 * Remember that these chains are completely isolated from
	 * the frontend, so we can release locks temporarily without
	 * imploding.
	 */
	for (;;) {
		error |= hammer2_chain_scan(parent, &chain, &bref, &first,
					    HAMMER2_LOOKUP_NODATA |
					    HAMMER2_LOOKUP_SHARED);

		/*
		 * Handle EOF or other error at current level.  This stops
		 * the bulkfree scan.
		 */
		if (error & ~HAMMER2_ERROR_CHECK)
			break;

		/*
		 * Account for dirents before the data_off test, since most
		 * dirents do not need a data reference.
		 */
		if (bref.type == HAMMER2_BREF_TYPE_DIRENT)
			++info->count_dirents_scanned;

		/*
		 * Ignore brefs without data (typically dirents)
		 */
		if ((bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
			continue;

		/*
		 * Process the bref.  The chain is only non-NULL if the bref
		 * might be recursable (it is possible that we sometimes get
		 * a non-NULL chain where the bref cannot be recursed).
		 *
		 * If we already ran down this tree we do not have to do it
		 * again, but we must still recover any cumulative error
		 * recorded from the time we did.
		 */
		++info->pri;
		e2 = h2_bulkfree_test(info, &bref, 1, 0);
		if (e2) {
			error |= e2 & ~HAMMER2_ERROR_EOF;
			continue;
		}

		if (bref.type == HAMMER2_BREF_TYPE_INODE)
			++info->count_inodes_scanned;

		error |= func(info, &bref);
		if (error & ~HAMMER2_ERROR_CHECK)
			break;

		/*
		 * A non-NULL chain is always returned when the bref is
		 * recursive.  When it is not, a non-NULL chain might still
		 * be returned, but usually is not.
		 */
		if (chain == NULL)
			continue;

		info->count_bytes_scanned += chain->bytes;
		++info->count_chains_scanned;

		if (info->count_chains_scanned >=
		    info->count_chains_reported + 1000000 ||
		    (info->count_chains_scanned < 1000000 &&
		     info->count_chains_scanned >=
		     info->count_chains_reported + 100000)) {
			kprintf(" chains %-7ld inodes %-7ld "
				"dirents %-7ld bytes %5ldMB\n",
				info->count_chains_scanned,
				info->count_inodes_scanned,
				info->count_dirents_scanned,
				info->count_bytes_scanned / 1000000);
			info->count_chains_reported =
				info->count_chains_scanned;
		}

		/*
		 * Else check type and set up the depth-first scan.
		 *
		 * Account for bytes actually read.
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			++info->depth;
			if (chain->error & HAMMER2_ERROR_CHECK) {
				/*
				 * Cannot safely recurse chains with crc
				 * errors, even in emergency mode.
				 */
				/* NOP */
			} else if (info->depth > 16 ||
				   info->backout ||
				   (info->depth > hammer2_limit_saved_depth &&
				   info->list_count >=
				    (hammer2_limit_saved_chains >> 2)))
			{
				/*
				 * We must defer the recursion if it runs
				 * too deep or if too many saved chains are
				 * allocated.
				 *
				 * In the case of too many saved chains, we
				 * have to stop recursing ASAP to avoid an
				 * explosion of memory use since each radix
				 * level can hold 512 elements.
				 *
				 * If we had to defer at a deeper level
				 * backout is non-NULL.  We must backout
				 * completely before resuming.
				 */
				if (info->list_count >
				     hammer2_limit_saved_chains &&
				    info->list_alert == 0)
				{
					kprintf("hammer2: during bulkfree, "
						"saved chains exceeded %ld "
						"at depth %d, "
						"backing off to less-efficient "
						"operation\n",
						hammer2_limit_saved_chains,
						info->depth);
					info->list_alert = 1;
				}

				/*
				 * Must be placed at head so pfsroot scan
				 * can exhaust saved elements for that pfs
				 * first.
				 *
				 * Must be placed at head for depth-first
				 * recovery when too many saved chains, to
				 * limit number of chains saved during
				 * saved-chain reruns.  The worst-case excess
				 * is (maximum_depth * 512) saved chains above
				 * the threshold.
				 *
				 * The maximum_depth generally occurs in the
				 * inode index and can be fairly deep once
				 * the radix tree becomes a bit fragmented.
				 * Nominally 100M inodes would be only 4 deep,
				 * plus a maximally sized file would be another
				 * 8 deep, but with fragmentation it can wind
				 * up being a lot more.
				 *
				 * However, when backing out, we have to place
				 * all the entries in each parent node not
				 * yet processed on the list too, and because
				 * these entries are shallower they must be
				 * placed after each other in order to maintain
				 * our depth-first processing.
				 */
				save = kmalloc(sizeof(*save), M_HAMMER2,
					       M_WAITOK | M_ZERO);
				save->chain = chain;
				hammer2_chain_ref(chain);

				if (info->backout) {
					TAILQ_INSERT_AFTER(&info->list,
							   info->backout,
							   save, entry);
				} else {
					TAILQ_INSERT_HEAD(&info->list,
							  save, entry);
				}
				info->backout = save;
				++info->list_count;
				if (info->list_count_max < info->list_count)
					info->list_count_max = info->list_count;

				/* guess */
				info->pri += 10;
			} else {
				int savepri = info->pri;

				hammer2_chain_unlock(chain);
				hammer2_chain_unlock(parent);
				info->pri = 0;
				rup_error |= hammer2_bulkfree_scan(chain,
								   func, info);
				info->pri += savepri;
				hammer2_chain_lock(parent,
						   HAMMER2_RESOLVE_ALWAYS |
						   HAMMER2_RESOLVE_SHARED);
				hammer2_chain_lock(chain,
						   HAMMER2_RESOLVE_ALWAYS |
						   HAMMER2_RESOLVE_SHARED);
			}
			--info->depth;
			break;
		case HAMMER2_BREF_TYPE_DATA:
			break;
		default:
			/* does not recurse */
			break;
		}
		if (rup_error & HAMMER2_ERROR_ABORTED)
			break;
	}
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}

	/*
	 * If this is a PFSROOT, also re-run any deferred elements
	 * added during our scan so we can report any cumulative errors
	 * for the PFS.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    (parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT)) {
		for (;;) {
			int opri;

			save = TAILQ_FIRST(&info->list);
			if (save == tail)	/* exhaust this PFS only */
				break;

			TAILQ_REMOVE(&info->list, save, entry);
			info->backout = NULL;
			--info->list_count;
			opri = info->pri;
			info->pri = 0;
			rup_error |= hammer2_bulkfree_scan(save->chain, func, info);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
			info->pri = opri;
		}
	}

	error |= rup_error;

	/*
	 * Report which PFS the errors were encountered in.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    (parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
	    (error & ~HAMMER2_ERROR_EOF)) {
		kprintf("hammer2_bulkfree: Encountered errors (%08x) "
			"while scanning \"%s\"\n",
			error, parent->data->ipdata.filename);
	}

	/*
	 * Save with higher pri now that we know what it is.
	 */
	h2_bulkfree_test(info, &parent->bref, info->pri + 1,
			 (error & ~HAMMER2_ERROR_EOF));

done:
	hammer2_chain_unlock(parent);

	return (error & ~HAMMER2_ERROR_EOF);
}

/*
 * Bulkfree algorithm
 *
 * Repeat {
 *	Chain flush (partial synchronization) XXX removed
 *	Scan the whole topology - build in-memory freemap (mark 11)
 *	Reconcile the in-memory freemap against the on-disk freemap.
 *		ondisk xx -> ondisk 11 (if allocated)
 *		ondisk 11 -> ondisk 10 (if free in-memory)
 *		ondisk 10 -> ondisk 00 (if free in-memory) - on next pass
 * }
 *
 * The topology scan may have to be performed multiple times to window
 * freemaps which are too large to fit in kernel memory.
 *
 * Races are handled using a double-transition (11->10, 10->00).  The bulkfree
 * scan snapshots the volume root's blockset and thus can run concurrent with
 * normal operations, as long as a full flush is made between each pass to
 * synchronize any modified chains (otherwise their blocks might be improperly
 * freed).
 *
 * Temporary memory in multiples of 32KB is required to reconstruct the leaf
 * hammer2_bmap_data blocks so they can later be compared against the live
 * freemap.  Each 32KB represents 256 x 16KB x 256 = ~1 GB of storage.
 * A 32MB save area thus represents ~1 TB.  The amount of temporary memory
 * allocated can be specified.  If it is not sufficient, multiple topology
 * passes will be made.
 */
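
/*
 * Illustrative sizing example (not from the original source): with the
 * 32KB-per-1GB ratio above, the 8MB buffer used by the (disabled) bulkfree
 * thread below covers (8MB / 32KB) * 1GB = 256GB of media per topology
 * pass, so a 1TB volume would require four passes at that buffer size.
 */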

/*
 * Bulkfree callback info
 */
//static void hammer2_bulkfree_thread(void *arg __unused);
static void cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size);
static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo,
			hammer2_blockref_t *bref);
static int h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo);
static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_off_t data_off, hammer2_bmap_data_t *live,
			hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base);

void
hammer2_bulkfree_init(hammer2_dev_t *hmp)
{
	/*
	hammer2_thr_create(&hmp->bfthr, NULL, hmp,
			   hmp->devrepname, -1, -1,
			   hammer2_bulkfree_thread);
	*/
}

void
hammer2_bulkfree_uninit(hammer2_dev_t *hmp)
{
	//hammer2_thr_delete(&hmp->bfthr);
}

#if 0
static void
hammer2_bulkfree_thread(void *arg)
{
	hammer2_thread_t *thr = arg;
	hammer2_ioc_bulkfree_t bfi;
	uint32_t flags;

	for (;;) {
		hammer2_thr_wait_any(thr,
				     HAMMER2_THREAD_STOP |
				     HAMMER2_THREAD_FREEZE |
				     HAMMER2_THREAD_UNFREEZE |
				     HAMMER2_THREAD_REMASTER,
				     hz * 60);

		flags = thr->flags;
		cpu_ccfence();
		if (flags & HAMMER2_THREAD_STOP)
			break;
		if (flags & HAMMER2_THREAD_FREEZE) {
			hammer2_thr_signal2(thr, HAMMER2_THREAD_FROZEN,
						 HAMMER2_THREAD_FREEZE);
			continue;
		}
		if (flags & HAMMER2_THREAD_UNFREEZE) {
			hammer2_thr_signal2(thr, 0,
						 HAMMER2_THREAD_FROZEN |
						 HAMMER2_THREAD_UNFREEZE);
			continue;
		}
		if (flags & HAMMER2_THREAD_FROZEN)
			continue;
		if (flags & HAMMER2_THREAD_REMASTER) {
			hammer2_thr_signal2(thr, 0, HAMMER2_THREAD_REMASTER);
			bzero(&bfi, sizeof(bfi));
			bfi.size = 8192 * 1024;
			/* hammer2_bulkfree_pass(thr->hmp, &bfi); */
		}
	}
	thr->td = NULL;
	hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED);
	/* structure can go invalid at this point */
}
#endif

int
hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_chain_t *vchain,
		      hammer2_ioc_bulkfree_t *bfi)
{
	hammer2_bulkfree_info_t cbinfo;
	hammer2_chain_save_t *save;
	hammer2_off_t incr;
	size_t size;
	int error;

	/*
	 * We have to clear the live dedup cache as it might have entries
	 * that are freeable as of now.  Any new entries in the dedup cache
	 * made after this point, even if they become freeable, will have
	 * previously been fully allocated and will be protected by the
	 * 2-stage bulkfree.
	 */
	hammer2_dedup_clear(hmp);

	/*
	 * Setup for free pass using the buffer size specified by the
	 * hammer2 utility, 32K-aligned.
	 */
	bzero(&cbinfo, sizeof(cbinfo));
	size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) &
	       ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1);

	/*
	 * Cap at 1/4 physical memory (hammer2 utility will not normally
	 * ever specify a buffer this big, but leave the option available).
	 */
	/*
	if (size > kmem_lim_size() * 1024 * 1024 / 4) {
		size = kmem_lim_size() * 1024 * 1024 / 4;
		kprintf("hammer2: Warning: capping bulkfree buffer at %jdM\n",
			(intmax_t)size / (1024 * 1024));
	}
	*/

#define HAMMER2_FREEMAP_SIZEDIV	\
	(HAMMER2_FREEMAP_LEVEL1_SIZE / HAMMER2_FREEMAP_LEVELN_PSIZE)

	/*
	 * Cap at the size needed to cover the whole volume to avoid
	 * making an unnecessarily large allocation.
	 */
	if (size > hmp->total_size / HAMMER2_FREEMAP_SIZEDIV)
		size = howmany(hmp->total_size, HAMMER2_FREEMAP_SIZEDIV);

	/*
	 * Minimum bitmap buffer size, then align to a LEVELN_PSIZE (32K)
	 * boundary.
	 */
	if (size < 1024 * 1024)
		size = 1024 * 1024;
	size = (size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) &
	       ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1);

	cbinfo.hmp = hmp;
	cbinfo.bmap = kmalloc(size, M_HAMMER2, M_WAITOK | M_ZERO);
	cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE,
			       M_HAMMER2, M_WAITOK | M_ZERO);

	kprintf("hammer2: bulkfree buf=%jdM\n",
		(intmax_t)size / (1024 * 1024));

	/*
	 * Normalize start point to a 1GB boundary.  We operate on a
	 * 32KB leaf bitmap boundary which represents 1GB of storage.
	 */
	cbinfo.sbase = bfi->sbase;
	if (cbinfo.sbase > hmp->total_size)
		cbinfo.sbase = hmp->total_size;
	cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK;
	TAILQ_INIT(&cbinfo.list);

	cbinfo.bulkfree_ticks = ticks;

	/*
	 * Loop on a full meta-data scan as many times as required to
	 * get through all available storage.
	 */
	error = 0;
	while (cbinfo.sbase < hmp->total_size) {
		/*
		 * We have enough ram to represent (incr) bytes of storage.
		 * Each 32KB of ram represents 1GB of storage.
		 *
		 * We must also clean out our de-duplication heuristic for
		 * each (incr) bytes of storage, otherwise we wind up not
		 * scanning meta-data for later areas of storage because
		 * they had already been scanned in earlier areas of storage.
		 * Since the ranging is different, we have to restart
		 * the dedup heuristic too.
		 */
		int allmedia;

		cbinfo_bmap_init(&cbinfo, size);
		bzero(cbinfo.dedup, sizeof(*cbinfo.dedup) *
				    HAMMER2_DEDUP_HEUR_SIZE);
		cbinfo.count_inodes_scanned = 0;
		cbinfo.count_dirents_scanned = 0;
		cbinfo.count_bytes_scanned = 0;
		cbinfo.count_chains_scanned = 0;
		cbinfo.count_chains_reported = 0;

		incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE *
		       HAMMER2_FREEMAP_LEVEL1_SIZE;
		if (hmp->total_size - cbinfo.sbase <= incr) {
			cbinfo.sstop = hmp->total_size;
			allmedia = 1;
		} else {
			cbinfo.sstop = cbinfo.sbase + incr;
			allmedia = 0;
		}
		kprintf("hammer2: pass %016jx-%016jx ",
			(intmax_t)cbinfo.sbase,
			(intmax_t)cbinfo.sstop);
		if (allmedia && cbinfo.sbase == 0)
			kprintf("(all media)\n");
		else if (allmedia)
			kprintf("(remaining media)\n");
		else
			kprintf("(%jdGB of media)\n",
				(intmax_t)incr / (1024L*1024*1024));

		/*
		 * Scan topology for stuff inside this range.
		 *
		 * NOTE - By not using a transaction the operation can
		 *	  run concurrent with the frontend as well as
		 *	  with flushes.
		 *
		 *	  We cannot safely set a mtid without a transaction,
		 *	  and in fact we don't want to set one anyway.  We
		 *	  want the bulkfree to be passive and not interfere
		 *	  with crash recovery.
		 */
#undef HAMMER2_BULKFREE_TRANS	/* undef - don't use transaction */
#ifdef HAMMER2_BULKFREE_TRANS
		hammer2_trans_init(hmp->spmp, 0);
		cbinfo.mtid = hammer2_trans_sub(hmp->spmp);
#else
		cbinfo.mtid = 0;
#endif
		cbinfo.pri = 0;
		error |= hammer2_bulkfree_scan(vchain,
					       h2_bulkfree_callback, &cbinfo);

		while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL &&
		       (error & ~HAMMER2_ERROR_CHECK) == 0) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			--cbinfo.list_count;
			cbinfo.pri = 0;
			cbinfo.backout = NULL;
			error |= hammer2_bulkfree_scan(save->chain,
						       h2_bulkfree_callback,
						       &cbinfo);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
		}
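
		/*
		 * If the scan aborted with a hard error, discard any
		 * remaining deferred chains without re-scanning them.
		 */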
		while (save) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			--cbinfo.list_count;
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
			save = TAILQ_FIRST(&cbinfo.list);
		}
		cbinfo.backout = NULL;

		/*
		 * If the complete scan succeeded we can synchronize our
		 * in-memory freemap against live storage.  If an abort
		 * occurred we cannot safely synchronize our partially
		 * filled-out in-memory freemap.
		 *
		 * We still synchronize on CHECK failures.  That is, we still
		 * want bulkfree to operate even if the filesystem has defects.
		 */
		if (error & ~HAMMER2_ERROR_CHECK) {
			kprintf("bulkfree lastdrop %d %d error=0x%04x\n",
				vchain->refs, vchain->core.chain_count, error);
		} else {
			if (error & HAMMER2_ERROR_CHECK) {
				kprintf("bulkfree lastdrop %d %d "
					"(with check errors)\n",
					vchain->refs, vchain->core.chain_count);
			} else {
				kprintf("bulkfree lastdrop %d %d\n",
					vchain->refs, vchain->core.chain_count);
			}

			error = h2_bulkfree_sync(&cbinfo);

			hammer2_voldata_lock(hmp);
			hammer2_voldata_modify(hmp);
			hmp->voldata.allocator_free += cbinfo.adj_free;
			hammer2_voldata_unlock(hmp);
		}

		/*
		 * Cleanup for next loop.
		 */
#ifdef HAMMER2_BULKFREE_TRANS
		hammer2_trans_done(hmp->spmp, 0);
#endif
		if (error & ~HAMMER2_ERROR_CHECK)
			break;
		cbinfo.sbase = cbinfo.sstop;
		cbinfo.adj_free = 0;
	}
	kfree(cbinfo.bmap, M_HAMMER2);
	kfree(cbinfo.dedup, M_HAMMER2);
	cbinfo.dedup = NULL;

	bfi->sstop = cbinfo.sbase;

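	/*
	 * Express the progress made as hundredths of a percent (0-10000)
	 * of total media size for the statistics report below.
	 */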
	incr = bfi->sstop / (hmp->total_size / 10000);
	if (incr > 10000)
		incr = 10000;

	kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n",
		(int)incr / 100,
		(int)incr % 100);

	if (error & ~HAMMER2_ERROR_CHECK) {
		kprintf("    bulkfree was aborted\n");
	} else {
		if (error & HAMMER2_ERROR_CHECK) {
			kprintf("    WARNING: bulkfree "
				"encountered CRC errors\n");
		}
		kprintf("    transition->free   %ld\n", cbinfo.count_10_00);
		kprintf("    transition->staged %ld\n", cbinfo.count_11_10);
		kprintf("    ERR(00)->allocated %ld\n", cbinfo.count_00_11);
		kprintf("    ERR(01)->allocated %ld\n", cbinfo.count_01_11);
		kprintf("    staged->allocated  %ld\n", cbinfo.count_10_11);
		kprintf("    ~4MB segs cleaned  %ld\n", cbinfo.count_l0cleans);
		kprintf("    linear adjusts     %ld\n",
			cbinfo.count_linadjusts);
		kprintf("    dedup factor       %ld\n",
			cbinfo.count_dedup_factor);
		kprintf("    max saved chains   %ld\n", cbinfo.list_count_max);
	}

	return error;
}

static void
cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size)
{
	hammer2_bmap_data_t *bmap = cbinfo->bmap;
	hammer2_key_t key = cbinfo->sbase;
	hammer2_key_t lokey;
	hammer2_key_t hikey;

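	/*
	 * Usable storage spans [allocator_beg rounded up to a segment,
	 * total_size rounded down to a segment).  Leaf bitmaps outside
	 * this range are forced to all-allocated below.
	 */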
	lokey = (cbinfo->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) &
		~HAMMER2_SEGMASK64;
	hikey = cbinfo->hmp->total_size & ~HAMMER2_SEGMASK64;

	bzero(bmap, size);
	while (size) {
		bzero(bmap, sizeof(*bmap));
		if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX))
			lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX);
		if (lokey < H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64)
			lokey = H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64;
		if (key < lokey || key >= hikey) {
			memset(bmap->bitmapq, -1,
			       sizeof(bmap->bitmapq));
			bmap->avail = 0;
			bmap->linear = HAMMER2_SEGSIZE;
		} else {
			bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
		}
		size -= sizeof(*bmap);
		key += HAMMER2_FREEMAP_LEVEL0_SIZE;
		++bmap;
	}
}

static int
h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
{
	hammer2_bmap_data_t *bmap;
	hammer2_off_t data_off;
	uint16_t class;
	size_t bytes;
	int radix;

	/*
	 * Check for signal and allow yield to userland during scan.
	 */
	if (hammer2_signal_check(&cbinfo->save_time))
		return HAMMER2_ERROR_ABORTED;

	/*
	 * Deal with kernel thread CPU or I/O hogging by limiting the
	 * number of chains scanned per second to hammer2_bulkfree_tps.
	 * Ignore leaf records (DIRENT and DATA); no per-record I/O is
	 * involved for those since we don't load their data.
	 */
	if (bref->type != HAMMER2_BREF_TYPE_DATA &&
	    bref->type != HAMMER2_BREF_TYPE_DIRENT) {
		++cbinfo->bulkfree_calls;
		if (cbinfo->bulkfree_calls > hammer2_bulkfree_tps) {
			int dticks = ticks - cbinfo->bulkfree_ticks;
			if (dticks < 0)
				dticks = 0;
			if (dticks < hz) {
				tsleep(&cbinfo->bulkfree_ticks, 0,
				       "h2bw", hz - dticks);
			}
			cbinfo->bulkfree_calls = 0;
			cbinfo->bulkfree_ticks = ticks;
		}
	}

	/*
	 * Calculate the data offset and determine if it is within
	 * the current freemap range being gathered.
	 */
	data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
	if (data_off < cbinfo->sbase || data_off >= cbinfo->sstop)
		return 0;
	if (data_off < cbinfo->hmp->voldata.allocator_beg)
		return 0;
	if (data_off >= cbinfo->hmp->total_size)
		return 0;

	/*
	 * Calculate the information needed to generate the in-memory
	 * freemap record.
	 *
	 * Hammer2 does not allow allocations to cross the L1 (1GB) boundary;
	 * it's a problem if it does.  (Or the L0 (4MB) boundary, for that matter.)
	 */
	radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	KKASSERT(radix != 0);
	bytes = (size_t)1 << radix;
	class = (bref->type << 8) | HAMMER2_PBUFRADIX;

	if (data_off + bytes > cbinfo->sstop) {
		kprintf("hammer2_bulkfree_scan: illegal 1GB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = cbinfo->sstop - data_off;	/* XXX */
	}

	/*
	 * Convert to a storage offset relative to the beginning of the
	 * storage range we are collecting.  Then lookup the level0 bmap entry.
	 */
	data_off -= cbinfo->sbase;
	bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX);

	/*
	 * Convert data_off to a bmap-relative value (~4MB storage range).
	 * Adjust linear, class, and avail.
	 *
	 * Hammer2 does not allow allocations to cross the L0 (4MB) boundary.
	 */
	data_off &= HAMMER2_FREEMAP_LEVEL0_MASK;
	if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) {
		kprintf("hammer2_bulkfree_scan: illegal 4MB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off;
	}

	if (bmap->class == 0) {
		bmap->class = class;
		bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
	}

	/*
	 * NOTE: bmap->class does not have to match class.  Classification
	 *	 is relaxed when free space is low, so some mixing can occur.
	 */
#if 0
	/*
	 * XXX removed
	 */
	if (bmap->class != class) {
		kprintf("hammer2_bulkfree_scan: illegal mixed class "
			"%016jx %016jx/%d (%04x vs %04x)\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits,
			class, bmap->class);
	}
#endif

	/*
	 * Just record the highest byte-granular offset for now.  Do not
	 * match against allocations which are in multiples of whole blocks.
	 *
	 * Make sure that any in-block linear offset at least covers the
	 * data range.  This can cause bmap->linear to become block-aligned.
	 */
	if (bytes & HAMMER2_FREEMAP_BLOCK_MASK) {
		if (bmap->linear < (int32_t)data_off + (int32_t)bytes)
			bmap->linear = (int32_t)data_off + (int32_t)bytes;
	} else if (bmap->linear >= (int32_t)data_off &&
		   bmap->linear < (int32_t)data_off + (int32_t)bytes) {
		bmap->linear = (int32_t)data_off + (int32_t)bytes;
	}

	/*
	 * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS].
	 * 64-bit entries, 2 bits per entry, to code 11.
	 *
	 * NOTE: data_off mask to 524288, shift right by 14 (radix for 16384),
	 *	 and multiply shift amount by 2 for sets of 2 bits.
	 *
	 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE.
	 *	 Also, data_off may not be FREEMAP_BLOCK_SIZE aligned.
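	 *
	 * Illustrative example (assuming the 16KB-block / 512KB-per-element
	 *	 layout implied by the numbers above): data_off 0x9c000 within
	 *	 the 4MB zone gives bindex = 0x9c000 >> 19 = 1, bit position
	 *	 ((0x9c000 & 0x7ffff) >> 14) << 1 = 14, and bmask = 3 << 14.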
	 */
	while (bytes > 0) {
		hammer2_bitmap_t bmask;
		int bindex;

		bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX +
					   HAMMER2_BMAP_INDEX_RADIX);
		bmask = (hammer2_bitmap_t)3 <<
			((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >>
			 HAMMER2_FREEMAP_BLOCK_RADIX) << 1);

		/*
		 * NOTE! The (avail) calculation is bitmap-granular.  Multiple
		 *	 sub-granular records can wind up at the same bitmap
		 *	 position.
		 */
		if ((bmap->bitmapq[bindex] & bmask) == 0) {
			if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) {
				bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE;
			} else {
				bmap->avail -= bytes;
			}
			bmap->bitmapq[bindex] |= bmask;
		}
		data_off += HAMMER2_FREEMAP_BLOCK_SIZE;
		if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE)
			bytes = 0;
		else
			bytes -= HAMMER2_FREEMAP_BLOCK_SIZE;
	}
	return 0;
}

/*
 * Synchronize the in-memory bitmap with the live freemap.  This is not a
 * direct copy.  Instead the bitmaps must be compared:
 *
 *	In-memory	Live-freemap
 *	   00		  11 -> 10	(do nothing if live modified)
 *			  10 -> 00	(do nothing if live modified)
 *	   11		  10 -> 11	handles race against live
 *			  ** -> 11	nominally warn of corruption
 *
985*a63188c8STomohiro Kusumi  * We must also fix up the hints in HAMMER2_BREF_TYPE_FREEMAP_LEAF.
986*a63188c8STomohiro Kusumi  */
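/*
 * For example (illustrative): a block freed by the filesystem before the
 * scan shows up as in-memory 00 against live 11.  One bulkfree pass stages
 * it to 10 and a later pass transitions 10 to 00, so storage is only
 * returned to the free pool after two separate passes see it unreferenced.
 */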
987*a63188c8STomohiro Kusumi static int
988*a63188c8STomohiro Kusumi h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
989*a63188c8STomohiro Kusumi {
990*a63188c8STomohiro Kusumi 	hammer2_off_t data_off;
991*a63188c8STomohiro Kusumi 	hammer2_key_t key;
992*a63188c8STomohiro Kusumi 	hammer2_key_t key_dummy;
993*a63188c8STomohiro Kusumi 	hammer2_bmap_data_t *bmap;
994*a63188c8STomohiro Kusumi 	hammer2_bmap_data_t *live;
995*a63188c8STomohiro Kusumi 	hammer2_chain_t *live_parent;
996*a63188c8STomohiro Kusumi 	hammer2_chain_t *live_chain;
997*a63188c8STomohiro Kusumi 	int bmapindex;
998*a63188c8STomohiro Kusumi 	int error;
999*a63188c8STomohiro Kusumi 
1000*a63188c8STomohiro Kusumi 	kprintf("hammer2_bulkfree - range ");
1001*a63188c8STomohiro Kusumi 
1002*a63188c8STomohiro Kusumi 	if (cbinfo->sbase < cbinfo->hmp->voldata.allocator_beg)
1003*a63188c8STomohiro Kusumi 		kprintf("%016jx-",
1004*a63188c8STomohiro Kusumi 			(intmax_t)cbinfo->hmp->voldata.allocator_beg);
1005*a63188c8STomohiro Kusumi 	else
1006*a63188c8STomohiro Kusumi 		kprintf("%016jx-",
1007*a63188c8STomohiro Kusumi 			(intmax_t)cbinfo->sbase);
1008*a63188c8STomohiro Kusumi 
1009*a63188c8STomohiro Kusumi 	if (cbinfo->sstop > cbinfo->hmp->total_size)
1010*a63188c8STomohiro Kusumi 		kprintf("%016jx\n",
1011*a63188c8STomohiro Kusumi 			(intmax_t)cbinfo->hmp->total_size);
1012*a63188c8STomohiro Kusumi 	else
1013*a63188c8STomohiro Kusumi 		kprintf("%016jx\n",
1014*a63188c8STomohiro Kusumi 			(intmax_t)cbinfo->sstop);
1015*a63188c8STomohiro Kusumi 
1016*a63188c8STomohiro Kusumi 	data_off = cbinfo->sbase;
1017*a63188c8STomohiro Kusumi 	bmap = cbinfo->bmap;
1018*a63188c8STomohiro Kusumi 
1019*a63188c8STomohiro Kusumi 	live_parent = &cbinfo->hmp->fchain;
1020*a63188c8STomohiro Kusumi 	hammer2_chain_ref(live_parent);
1021*a63188c8STomohiro Kusumi 	hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS);
1022*a63188c8STomohiro Kusumi 	live_chain = NULL;
1023*a63188c8STomohiro Kusumi 	error = 0;
1024*a63188c8STomohiro Kusumi 
1025*a63188c8STomohiro Kusumi 	/*
1026*a63188c8STomohiro Kusumi 	 * Iterate over each hammer2_bmap_data_t line (128 bytes), each of
1027*a63188c8STomohiro Kusumi 	 * which manages 4MB of storage.
1028*a63188c8STomohiro Kusumi 	 */
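	/*
	 * (Arithmetic for reference: 8 64-bit bitmapq elements at 2 bits
	 * per 16KB block cover 256 blocks x 16KB = 4MB, using 64 of the
	 * 128 bytes in the structure.)
	 */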
1029*a63188c8STomohiro Kusumi 	while (data_off < cbinfo->sstop) {
1030*a63188c8STomohiro Kusumi 		/*
1031*a63188c8STomohiro Kusumi 		 * The freemap is not used below allocator_beg or beyond
1032*a63188c8STomohiro Kusumi 		 * total_size.
1033*a63188c8STomohiro Kusumi 		 */
1034*a63188c8STomohiro Kusumi 
1035*a63188c8STomohiro Kusumi 		if (data_off < cbinfo->hmp->voldata.allocator_beg)
1036*a63188c8STomohiro Kusumi 			goto next;
1037*a63188c8STomohiro Kusumi 		if (data_off >= cbinfo->hmp->total_size)
1038*a63188c8STomohiro Kusumi 			goto next;
1039*a63188c8STomohiro Kusumi 
1040*a63188c8STomohiro Kusumi 		/*
1041*a63188c8STomohiro Kusumi 		 * Locate the freemap leaf on the live filesystem
1042*a63188c8STomohiro Kusumi 		 */
1043*a63188c8STomohiro Kusumi 		key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK);
1044*a63188c8STomohiro Kusumi 
1045*a63188c8STomohiro Kusumi 		if (live_chain == NULL || live_chain->bref.key != key) {
1046*a63188c8STomohiro Kusumi 			if (live_chain) {
1047*a63188c8STomohiro Kusumi 				hammer2_chain_unlock(live_chain);
1048*a63188c8STomohiro Kusumi 				hammer2_chain_drop(live_chain);
1049*a63188c8STomohiro Kusumi 			}
1050*a63188c8STomohiro Kusumi 			live_chain = hammer2_chain_lookup(
1051*a63188c8STomohiro Kusumi 					    &live_parent,
1052*a63188c8STomohiro Kusumi 					    &key_dummy,
1053*a63188c8STomohiro Kusumi 					    key,
1054*a63188c8STomohiro Kusumi 					    key + HAMMER2_FREEMAP_LEVEL1_MASK,
1055*a63188c8STomohiro Kusumi 					    &error,
1056*a63188c8STomohiro Kusumi 					    HAMMER2_LOOKUP_ALWAYS);
1057*a63188c8STomohiro Kusumi 			if (error) {
1058*a63188c8STomohiro Kusumi 				kprintf("hammer2_bulkfree: freemap lookup "
1059*a63188c8STomohiro Kusumi 					"error near %016jx, error %s\n",
1060*a63188c8STomohiro Kusumi 					(intmax_t)data_off,
1061*a63188c8STomohiro Kusumi 					hammer2_error_str(error));
1062*a63188c8STomohiro Kusumi 				break;
1063*a63188c8STomohiro Kusumi 			}
1064*a63188c8STomohiro Kusumi 		}
1065*a63188c8STomohiro Kusumi 		if (live_chain == NULL) {
1066*a63188c8STomohiro Kusumi 			/*
1067*a63188c8STomohiro Kusumi 			 * XXX if we implement a full recovery mode we need
1068*a63188c8STomohiro Kusumi 			 * to create/recreate missing freemap chains if our
1069*a63188c8STomohiro Kusumi 			 * bmap has any allocated blocks.
1070*a63188c8STomohiro Kusumi 			 */
1071*a63188c8STomohiro Kusumi 			if (bmap->class &&
1072*a63188c8STomohiro Kusumi 			    bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) {
1073*a63188c8STomohiro Kusumi 				kprintf("hammer2_bulkfree: cannot locate "
1074*a63188c8STomohiro Kusumi 					"live leaf for allocated data "
1075*a63188c8STomohiro Kusumi 					"near %016jx\n",
1076*a63188c8STomohiro Kusumi 					(intmax_t)data_off);
1077*a63188c8STomohiro Kusumi 			}
1078*a63188c8STomohiro Kusumi 			goto next;
1079*a63188c8STomohiro Kusumi 		}
1080*a63188c8STomohiro Kusumi 		if (live_chain->error) {
1081*a63188c8STomohiro Kusumi 			kprintf("hammer2_bulkfree: unable to access freemap "
1082*a63188c8STomohiro Kusumi 				"near %016jx, error %s\n",
1083*a63188c8STomohiro Kusumi 				(intmax_t)data_off,
1084*a63188c8STomohiro Kusumi 				hammer2_error_str(live_chain->error));
1085*a63188c8STomohiro Kusumi 			hammer2_chain_unlock(live_chain);
1086*a63188c8STomohiro Kusumi 			hammer2_chain_drop(live_chain);
1087*a63188c8STomohiro Kusumi 			live_chain = NULL;
1088*a63188c8STomohiro Kusumi 			goto next;
1089*a63188c8STomohiro Kusumi 		}
1090*a63188c8STomohiro Kusumi 
1091*a63188c8STomohiro Kusumi 		bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >>
1092*a63188c8STomohiro Kusumi 			    HAMMER2_FREEMAP_LEVEL0_RADIX;
1093*a63188c8STomohiro Kusumi 		live = &live_chain->data->bmdata[bmapindex];
1094*a63188c8STomohiro Kusumi 
1095*a63188c8STomohiro Kusumi 		/*
1096*a63188c8STomohiro Kusumi 		 * Shortcut if the bitmaps match and the live linear
1097*a63188c8STomohiro Kusumi 		 * indicator is sane.  We can't do a perfect check of
1098*a63188c8STomohiro Kusumi 		 * live->linear because the only real requirement is that,
1099*a63188c8STomohiro Kusumi 		 * when it is not block-aligned, it must not cover space in
1100*a63188c8STomohiro Kusumi 		 * its current block that overlaps one of the data ranges
1101*a63188c8STomohiro Kusumi 		 * we scanned.  We don't retain enough fine-grained
1102*a63188c8STomohiro Kusumi 		 * data in our scan to be able to set it exactly.
1103*a63188c8STomohiro Kusumi 		 *
1104*a63188c8STomohiro Kusumi 		 * TODO - we could shortcut this by testing that both
1105*a63188c8STomohiro Kusumi 		 * live->class and bmap->class are 0, and both avails are
1106*a63188c8STomohiro Kusumi 		 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB).
1107*a63188c8STomohiro Kusumi 		 */
1108*a63188c8STomohiro Kusumi 		if (bcmp(live->bitmapq, bmap->bitmapq,
1109*a63188c8STomohiro Kusumi 			 sizeof(bmap->bitmapq)) == 0 &&
1110*a63188c8STomohiro Kusumi 		    live->linear >= bmap->linear &&
1111*a63188c8STomohiro Kusumi 		    (hammer2_aux_flags & 1) == 0 &&
1112*a63188c8STomohiro Kusumi 		    bigmask_good(bmap, live_chain->bref.check.freemap.bigmask))
1113*a63188c8STomohiro Kusumi 		{
1114*a63188c8STomohiro Kusumi 			goto next;
1115*a63188c8STomohiro Kusumi 		}
1116*a63188c8STomohiro Kusumi 		if (hammer2_debug & 1) {
1117*a63188c8STomohiro Kusumi 			kprintf("live %016jx %04d.%04x (avail=%d) "
1118*a63188c8STomohiro Kusumi 				"bigmask %08x->%08x\n",
1119*a63188c8STomohiro Kusumi 				(intmax_t)data_off, bmapindex, live->class, live->avail,
1120*a63188c8STomohiro Kusumi 				live_chain->bref.check.freemap.bigmask,
1121*a63188c8STomohiro Kusumi 				live_chain->bref.check.freemap.bigmask |
1122*a63188c8STomohiro Kusumi 				bigmask_get(bmap));
1123*a63188c8STomohiro Kusumi 		}
1124*a63188c8STomohiro Kusumi 
1125*a63188c8STomohiro Kusumi 		if (hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0)) {
1126*a63188c8STomohiro Kusumi 			kprintf("hammer2_bulkfree: unable to modify freemap "
1127*a63188c8STomohiro Kusumi 				"at %016jx for data-block %016jx, error %s\n",
1128*a63188c8STomohiro Kusumi 				(intmax_t)live_chain->bref.data_off,
1129*a63188c8STomohiro Kusumi 				(intmax_t)data_off,
1130*a63188c8STomohiro Kusumi 				hammer2_error_str(live_chain->error));
1131*a63188c8STomohiro Kusumi 			hammer2_chain_unlock(live_chain);
1132*a63188c8STomohiro Kusumi 			hammer2_chain_drop(live_chain);
1133*a63188c8STomohiro Kusumi 			live_chain = NULL;
1134*a63188c8STomohiro Kusumi 			goto next;
1135*a63188c8STomohiro Kusumi 		}
1136*a63188c8STomohiro Kusumi 		live_chain->bref.check.freemap.bigmask = -1;
1137*a63188c8STomohiro Kusumi 		cbinfo->hmp->freemap_relaxed = 0;	/* reset heuristic */
1138*a63188c8STomohiro Kusumi 		live = &live_chain->data->bmdata[bmapindex];
1139*a63188c8STomohiro Kusumi 
1140*a63188c8STomohiro Kusumi 		h2_bulkfree_sync_adjust(cbinfo, data_off, live, bmap,
1141*a63188c8STomohiro Kusumi 					live_chain->bref.key +
1142*a63188c8STomohiro Kusumi 					bmapindex *
1143*a63188c8STomohiro Kusumi 					HAMMER2_FREEMAP_LEVEL0_SIZE);
1144*a63188c8STomohiro Kusumi next:
1145*a63188c8STomohiro Kusumi 		data_off += HAMMER2_FREEMAP_LEVEL0_SIZE;
1146*a63188c8STomohiro Kusumi 		++bmap;
1147*a63188c8STomohiro Kusumi 	}
1148*a63188c8STomohiro Kusumi 	if (live_chain) {
1149*a63188c8STomohiro Kusumi 		hammer2_chain_unlock(live_chain);
1150*a63188c8STomohiro Kusumi 		hammer2_chain_drop(live_chain);
1151*a63188c8STomohiro Kusumi 	}
1152*a63188c8STomohiro Kusumi 	if (live_parent) {
1153*a63188c8STomohiro Kusumi 		hammer2_chain_unlock(live_parent);
1154*a63188c8STomohiro Kusumi 		hammer2_chain_drop(live_parent);
1155*a63188c8STomohiro Kusumi 	}
1156*a63188c8STomohiro Kusumi 	return error;
1157*a63188c8STomohiro Kusumi }
1158*a63188c8STomohiro Kusumi 
1159*a63188c8STomohiro Kusumi /*
1160*a63188c8STomohiro Kusumi  * Merge the bulkfree bitmap against the existing bitmap.
1161*a63188c8STomohiro Kusumi  */
1162*a63188c8STomohiro Kusumi static
1163*a63188c8STomohiro Kusumi void
1164*a63188c8STomohiro Kusumi h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
1165*a63188c8STomohiro Kusumi 			hammer2_off_t data_off, hammer2_bmap_data_t *live,
1166*a63188c8STomohiro Kusumi 			hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base)
1167*a63188c8STomohiro Kusumi {
1168*a63188c8STomohiro Kusumi 	int bindex;
1169*a63188c8STomohiro Kusumi 	int scount;
1170*a63188c8STomohiro Kusumi 	hammer2_off_t tmp_off;
1171*a63188c8STomohiro Kusumi 	hammer2_bitmap_t lmask;
1172*a63188c8STomohiro Kusumi 	hammer2_bitmap_t mmask;
1173*a63188c8STomohiro Kusumi 
1174*a63188c8STomohiro Kusumi 	tmp_off = data_off;
1175*a63188c8STomohiro Kusumi 
1176*a63188c8STomohiro Kusumi 	for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) {
1177*a63188c8STomohiro Kusumi 		lmask = live->bitmapq[bindex];	/* live */
1178*a63188c8STomohiro Kusumi 		mmask = bmap->bitmapq[bindex];	/* snapshotted bulkfree */
1179*a63188c8STomohiro Kusumi 		if (lmask == mmask) {
1180*a63188c8STomohiro Kusumi 			tmp_off += HAMMER2_BMAP_INDEX_SIZE;
1181*a63188c8STomohiro Kusumi 			continue;
1182*a63188c8STomohiro Kusumi 		}
1183*a63188c8STomohiro Kusumi 
1184*a63188c8STomohiro Kusumi 		for (scount = 0;
1185*a63188c8STomohiro Kusumi 		     scount < HAMMER2_BMAP_BITS_PER_ELEMENT;
1186*a63188c8STomohiro Kusumi 		     scount += 2) {
1187*a63188c8STomohiro Kusumi 			if ((mmask & 3) == 0) {
1188*a63188c8STomohiro Kusumi 				/*
1189*a63188c8STomohiro Kusumi 				 * in-memory 00		live 11 -> 10
1190*a63188c8STomohiro Kusumi 				 *			live 10 -> 00
1191*a63188c8STomohiro Kusumi 				 *
1192*a63188c8STomohiro Kusumi 				 * Storage might be marked allocated or
1193*a63188c8STomohiro Kusumi 				 * staged and must be remarked staged or
1194*a63188c8STomohiro Kusumi 				 * free.
1195*a63188c8STomohiro Kusumi 				 */
1196*a63188c8STomohiro Kusumi 				switch (lmask & 3) {
1197*a63188c8STomohiro Kusumi 				case 0:	/* 00 */
1198*a63188c8STomohiro Kusumi 					break;
1199*a63188c8STomohiro Kusumi 				case 1:	/* 01 */
1200*a63188c8STomohiro Kusumi 					kprintf("hammer2_bulkfree: cannot "
1201*a63188c8STomohiro Kusumi 						"transition m=00/l=01\n");
1202*a63188c8STomohiro Kusumi 					break;
1203*a63188c8STomohiro Kusumi 				case 2:	/* 10 -> 00 */
1204*a63188c8STomohiro Kusumi 					live->bitmapq[bindex] &=
1205*a63188c8STomohiro Kusumi 					    ~((hammer2_bitmap_t)2 << scount);
1206*a63188c8STomohiro Kusumi 					live->avail +=
1207*a63188c8STomohiro Kusumi 						HAMMER2_FREEMAP_BLOCK_SIZE;
1208*a63188c8STomohiro Kusumi 					if (live->avail >
1209*a63188c8STomohiro Kusumi 					    HAMMER2_FREEMAP_LEVEL0_SIZE) {
1210*a63188c8STomohiro Kusumi 						live->avail =
1211*a63188c8STomohiro Kusumi 						    HAMMER2_FREEMAP_LEVEL0_SIZE;
1212*a63188c8STomohiro Kusumi 					}
1213*a63188c8STomohiro Kusumi 					cbinfo->adj_free +=
1214*a63188c8STomohiro Kusumi 						HAMMER2_FREEMAP_BLOCK_SIZE;
1215*a63188c8STomohiro Kusumi 					++cbinfo->count_10_00;
1216*a63188c8STomohiro Kusumi 					hammer2_io_dedup_assert(
1217*a63188c8STomohiro Kusumi 						cbinfo->hmp,
1218*a63188c8STomohiro Kusumi 						tmp_off |
1219*a63188c8STomohiro Kusumi 						HAMMER2_FREEMAP_BLOCK_RADIX,
1220*a63188c8STomohiro Kusumi 						HAMMER2_FREEMAP_BLOCK_SIZE);
1221*a63188c8STomohiro Kusumi 					break;
1222*a63188c8STomohiro Kusumi 				case 3:	/* 11 -> 10 */
1223*a63188c8STomohiro Kusumi 					live->bitmapq[bindex] &=
1224*a63188c8STomohiro Kusumi 					    ~((hammer2_bitmap_t)1 << scount);
1225*a63188c8STomohiro Kusumi 					++cbinfo->count_11_10;
1226*a63188c8STomohiro Kusumi 					hammer2_io_dedup_delete(
1227*a63188c8STomohiro Kusumi 						cbinfo->hmp,
1228*a63188c8STomohiro Kusumi 						HAMMER2_BREF_TYPE_DATA,
1229*a63188c8STomohiro Kusumi 						tmp_off |
1230*a63188c8STomohiro Kusumi 						HAMMER2_FREEMAP_BLOCK_RADIX,
1231*a63188c8STomohiro Kusumi 						HAMMER2_FREEMAP_BLOCK_SIZE);
1232*a63188c8STomohiro Kusumi 					break;
1233*a63188c8STomohiro Kusumi 				}
1234*a63188c8STomohiro Kusumi 			} else if ((mmask & 3) == 3) {
1235*a63188c8STomohiro Kusumi 				/*
1236*a63188c8STomohiro Kusumi 				 * in-memory 11		live 10 -> 11
1237*a63188c8STomohiro Kusumi 				 *			live ** -> 11
1238*a63188c8STomohiro Kusumi 				 *
1239*a63188c8STomohiro Kusumi 				 * Storage might be incorrectly marked free
1240*a63188c8STomohiro Kusumi 				 * or staged and must be remarked fully
1241*a63188c8STomohiro Kusumi 				 * allocated.
1242*a63188c8STomohiro Kusumi 				 */
1243*a63188c8STomohiro Kusumi 				switch (lmask & 3) {
1244*a63188c8STomohiro Kusumi 				case 0:	/* 00 */
1245*a63188c8STomohiro Kusumi 					/*
1246*a63188c8STomohiro Kusumi 					 * This case is not supposed to
1247*a63188c8STomohiro Kusumi 					 * happen.  If it does, it means
1248*a63188c8STomohiro Kusumi 					 * that an allocated block was
1249*a63188c8STomohiro Kusumi 					 * thought by the filesystem to be
1250*a63188c8STomohiro Kusumi 					 * free.
1251*a63188c8STomohiro Kusumi 					 */
1252*a63188c8STomohiro Kusumi 					kprintf("hammer2_bulkfree: "
1253*a63188c8STomohiro Kusumi 						"00->11 critical freemap "
1254*a63188c8STomohiro Kusumi 						"transition for datablock "
1255*a63188c8STomohiro Kusumi 						"%016jx\n",
1256*a63188c8STomohiro Kusumi 						(intmax_t)tmp_off);
1257*a63188c8STomohiro Kusumi 					++cbinfo->count_00_11;
1258*a63188c8STomohiro Kusumi 					cbinfo->adj_free -=
1259*a63188c8STomohiro Kusumi 						HAMMER2_FREEMAP_BLOCK_SIZE;
1260*a63188c8STomohiro Kusumi 					live->avail -=
1261*a63188c8STomohiro Kusumi 						HAMMER2_FREEMAP_BLOCK_SIZE;
1262*a63188c8STomohiro Kusumi 					if ((int32_t)live->avail < 0)
1263*a63188c8STomohiro Kusumi 						live->avail = 0;
1264*a63188c8STomohiro Kusumi 					break;
1265*a63188c8STomohiro Kusumi 				case 1:	/* 01 */
1266*a63188c8STomohiro Kusumi 					++cbinfo->count_01_11;
1267*a63188c8STomohiro Kusumi 					break;
1268*a63188c8STomohiro Kusumi 				case 2:	/* 10 -> 11 */
1269*a63188c8STomohiro Kusumi 					++cbinfo->count_10_11;
1270*a63188c8STomohiro Kusumi 					break;
1271*a63188c8STomohiro Kusumi 				case 3:	/* 11 */
1272*a63188c8STomohiro Kusumi 					break;
1273*a63188c8STomohiro Kusumi 				}
1274*a63188c8STomohiro Kusumi 				live->bitmapq[bindex] |=
1275*a63188c8STomohiro Kusumi 					((hammer2_bitmap_t)3 << scount);
1276*a63188c8STomohiro Kusumi 			}
1277*a63188c8STomohiro Kusumi 			mmask >>= 2;
1278*a63188c8STomohiro Kusumi 			lmask >>= 2;
1279*a63188c8STomohiro Kusumi 			tmp_off += HAMMER2_FREEMAP_BLOCK_SIZE;
1280*a63188c8STomohiro Kusumi 		}
1281*a63188c8STomohiro Kusumi 	}
1282*a63188c8STomohiro Kusumi 
1283*a63188c8STomohiro Kusumi 	/*
1284*a63188c8STomohiro Kusumi 	 * Determine if the live bitmap is completely free and reset its
1285*a63188c8STomohiro Kusumi 	 * fields if so.  Otherwise check to see if we can reduce the linear
1286*a63188c8STomohiro Kusumi 	 * offset.
1287*a63188c8STomohiro Kusumi 	 */
1288*a63188c8STomohiro Kusumi 	for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) {
1289*a63188c8STomohiro Kusumi 		if (live->bitmapq[bindex] != 0)
1290*a63188c8STomohiro Kusumi 			break;
1291*a63188c8STomohiro Kusumi 	}
1292*a63188c8STomohiro Kusumi 	if (bindex < 0) {
1293*a63188c8STomohiro Kusumi 		/*
1294*a63188c8STomohiro Kusumi 		 * Completely empty, reset entire segment
1295*a63188c8STomohiro Kusumi 		 */
1296*a63188c8STomohiro Kusumi #if 0
1297*a63188c8STomohiro Kusumi 		kprintf("hammer2: cleanseg %016jx.%04x (%d)\n",
1298*a63188c8STomohiro Kusumi 			(intmax_t)alloc_base, live->class, live->avail);
1299*a63188c8STomohiro Kusumi #endif
1300*a63188c8STomohiro Kusumi 		live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
1301*a63188c8STomohiro Kusumi 		live->class = 0;
1302*a63188c8STomohiro Kusumi 		live->linear = 0;
1303*a63188c8STomohiro Kusumi 		++cbinfo->count_l0cleans;
1304*a63188c8STomohiro Kusumi 	} else if (bindex < 7) {
1305*a63188c8STomohiro Kusumi 		/*
1306*a63188c8STomohiro Kusumi 		 * Partially full, bitmapq[bindex] != 0.  Our bulkfree pass
1307*a63188c8STomohiro Kusumi 		 * does not record enough information to set live->linear
1308*a63188c8STomohiro Kusumi 		 * exactly.
1309*a63188c8STomohiro Kusumi 		 *
1310*a63188c8STomohiro Kusumi 		 * NOTE: Setting live->linear to a sub-block (16K) boundary
1311*a63188c8STomohiro Kusumi 		 *	 forces the live code to iterate to the next fully
1312*a63188c8STomohiro Kusumi 		 *	 free block.  It does NOT mean that all blocks above
1313*a63188c8STomohiro Kusumi 		 *	 live->linear are available.
1314*a63188c8STomohiro Kusumi 		 *
1315*a63188c8STomohiro Kusumi 		 *	 Setting live->linear to a fragmentary (less than
1316*a63188c8STomohiro Kusumi 		 *	 16K) boundary allows allocations to iterate within
1317*a63188c8STomohiro Kusumi 		 *	 that sub-block.
1318*a63188c8STomohiro Kusumi 		 */
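		/*
		 * For example (illustrative, assuming 16KB sub-blocks): with
		 * live->linear at 0x2000 and bmap->linear at 0x3800, both
		 * fall in the same sub-block and live->linear is simply
		 * raised to 0x3800.  If bmap->linear were 0x13800 instead,
		 * live->linear would be rounded up to the 0x14000 boundary.
		 */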
1319*a63188c8STomohiro Kusumi 		if (live->linear < bmap->linear &&
1320*a63188c8STomohiro Kusumi 		    ((live->linear ^ bmap->linear) &
1321*a63188c8STomohiro Kusumi 		     ~HAMMER2_FREEMAP_BLOCK_MASK) == 0) {
1322*a63188c8STomohiro Kusumi 			/*
1323*a63188c8STomohiro Kusumi 			 * If greater than but still within the same
1324*a63188c8STomohiro Kusumi 			 * sub-block as live we can adjust linear upward.
1325*a63188c8STomohiro Kusumi 			 */
1326*a63188c8STomohiro Kusumi 			live->linear = bmap->linear;
1327*a63188c8STomohiro Kusumi 			++cbinfo->count_linadjusts;
1328*a63188c8STomohiro Kusumi 		} else {
1329*a63188c8STomohiro Kusumi 			/*
1330*a63188c8STomohiro Kusumi 			 * Otherwise adjust to the nearest higher or same
1331*a63188c8STomohiro Kusumi 			 * sub-block boundary.  The live system may have
1332*a63188c8STomohiro Kusumi 			 * bounced live->linear around so we cannot make any
1333*a63188c8STomohiro Kusumi 			 * assumptions with regards to available fragmentary
1334*a63188c8STomohiro Kusumi 			 * allocations.
1335*a63188c8STomohiro Kusumi 			 */
1336*a63188c8STomohiro Kusumi 			live->linear =
1337*a63188c8STomohiro Kusumi 				(bmap->linear + HAMMER2_FREEMAP_BLOCK_MASK) &
1338*a63188c8STomohiro Kusumi 				~HAMMER2_FREEMAP_BLOCK_MASK;
1339*a63188c8STomohiro Kusumi 			++cbinfo->count_linadjusts;
1340*a63188c8STomohiro Kusumi 		}
1341*a63188c8STomohiro Kusumi 	} else {
1342*a63188c8STomohiro Kusumi 		/*
1343*a63188c8STomohiro Kusumi 		 * Top element not empty; effectively disable the linear iterator
1344*a63188c8STomohiro Kusumi 		 */
1345*a63188c8STomohiro Kusumi 		live->linear = HAMMER2_SEGSIZE;
1346*a63188c8STomohiro Kusumi 	}
1347*a63188c8STomohiro Kusumi 
1348*a63188c8STomohiro Kusumi #if 0
1349*a63188c8STomohiro Kusumi 	if (bmap->class) {
1350*a63188c8STomohiro Kusumi 		kprintf("%016jx %04d.%04x (avail=%7d) "
1351*a63188c8STomohiro Kusumi 			"%08x %08x %08x %08x %08x %08x %08x %08x\n",
1352*a63188c8STomohiro Kusumi 			(intmax_t)data_off,
1353*a63188c8STomohiro Kusumi 			(int)((data_off &
1354*a63188c8STomohiro Kusumi 			       HAMMER2_FREEMAP_LEVEL1_MASK) >>
1355*a63188c8STomohiro Kusumi 			      HAMMER2_FREEMAP_LEVEL0_RADIX),
1356*a63188c8STomohiro Kusumi 			bmap->class,
1357*a63188c8STomohiro Kusumi 			bmap->avail,
1358*a63188c8STomohiro Kusumi 			bmap->bitmapq[0], bmap->bitmapq[1],
1359*a63188c8STomohiro Kusumi 			bmap->bitmapq[2], bmap->bitmapq[3],
1360*a63188c8STomohiro Kusumi 			bmap->bitmapq[4], bmap->bitmapq[5],
1361*a63188c8STomohiro Kusumi 			bmap->bitmapq[6], bmap->bitmapq[7]);
1362*a63188c8STomohiro Kusumi 	}
1363*a63188c8STomohiro Kusumi #endif
1364*a63188c8STomohiro Kusumi }
1365*a63188c8STomohiro Kusumi 
1366*a63188c8STomohiro Kusumi /*
1367*a63188c8STomohiro Kusumi  * BULKFREE DEDUP HEURISTIC
1368*a63188c8STomohiro Kusumi  *
1369*a63188c8STomohiro Kusumi  * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
1370*a63188c8STomohiro Kusumi  *	    All fields must be loaded into locals and validated.
1371*a63188c8STomohiro Kusumi  */
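/*
 * Roughly (illustrative summary of the code below): the heuristic is an
 * 8-way set of (data_off, ticks, saved_error) entries selected by a 32-bit
 * CRC of bref->data_off.  A repeat visit to the same data_off returns the
 * saved error flagged with HAMMER2_ERROR_EOF; otherwise the entry with the
 * lowest ticks value is recycled for the new block reference.
 */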
1372*a63188c8STomohiro Kusumi static
1373*a63188c8STomohiro Kusumi int
1374*a63188c8STomohiro Kusumi h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref,
1375*a63188c8STomohiro Kusumi 		 int pri, int saved_error)
1376*a63188c8STomohiro Kusumi {
1377*a63188c8STomohiro Kusumi 	hammer2_dedup_t *dedup;
1378*a63188c8STomohiro Kusumi 	int best;
1379*a63188c8STomohiro Kusumi 	int n;
1380*a63188c8STomohiro Kusumi 	int i;
1381*a63188c8STomohiro Kusumi 
1382*a63188c8STomohiro Kusumi 	n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off));
1383*a63188c8STomohiro Kusumi 	dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7));
1384*a63188c8STomohiro Kusumi 
1385*a63188c8STomohiro Kusumi 	for (i = best = 0; i < 8; ++i) {
1386*a63188c8STomohiro Kusumi 		if (dedup[i].data_off == bref->data_off) {
1387*a63188c8STomohiro Kusumi 			if (dedup[i].ticks < pri)
1388*a63188c8STomohiro Kusumi 				dedup[i].ticks = pri;
1389*a63188c8STomohiro Kusumi 			if (pri == 1)
1390*a63188c8STomohiro Kusumi 				cbinfo->count_dedup_factor += dedup[i].ticks;
1391*a63188c8STomohiro Kusumi 			return (dedup[i].saved_error | HAMMER2_ERROR_EOF);
1392*a63188c8STomohiro Kusumi 		}
1393*a63188c8STomohiro Kusumi 		if (dedup[i].ticks < dedup[best].ticks)
1394*a63188c8STomohiro Kusumi 			best = i;
1395*a63188c8STomohiro Kusumi 	}
1396*a63188c8STomohiro Kusumi 	dedup[best].data_off = bref->data_off;
1397*a63188c8STomohiro Kusumi 	dedup[best].ticks = pri;
1398*a63188c8STomohiro Kusumi 	dedup[best].saved_error = saved_error;
1399*a63188c8STomohiro Kusumi 
1400*a63188c8STomohiro Kusumi 	return 0;
1401*a63188c8STomohiro Kusumi }
1402*a63188c8STomohiro Kusumi 
1403*a63188c8STomohiro Kusumi /*
1404*a63188c8STomohiro Kusumi  * Calculate what the bigmask should be.  bigmask is permissive, so the
1405*a63188c8STomohiro Kusumi  * bits returned must be set at a minimum in the live bigmask.  Other bits
1406*a63188c8STomohiro Kusumi  * might also be set in the live bigmask.
1407*a63188c8STomohiro Kusumi  */
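/*
 * For example (illustrative, assuming 16KB blocks and 512KB per bitmap
 * element): a bitmapq element equal to 0 (512KB fully free) contributes
 * radix bits 0-19 to bigmask, while an element with only a single free
 * 16KB block contributes only radix bits 0-14.
 */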
1408*a63188c8STomohiro Kusumi static uint32_t
1409*a63188c8STomohiro Kusumi bigmask_get(hammer2_bmap_data_t *bmap)
1410*a63188c8STomohiro Kusumi {
1411*a63188c8STomohiro Kusumi 	hammer2_bitmap_t mask;	/* 64-bit mask to check */
1412*a63188c8STomohiro Kusumi 	hammer2_bitmap_t scan;
1413*a63188c8STomohiro Kusumi 	uint32_t bigmask;
1414*a63188c8STomohiro Kusumi 	uint32_t radix_mask;
1415*a63188c8STomohiro Kusumi 	int iter;
1416*a63188c8STomohiro Kusumi 	int i;
1417*a63188c8STomohiro Kusumi 	int j;
1418*a63188c8STomohiro Kusumi 
1419*a63188c8STomohiro Kusumi 	bigmask = 0;
1420*a63188c8STomohiro Kusumi 	for (i = 0; i < HAMMER2_BMAP_ELEMENTS; ++i) {
1421*a63188c8STomohiro Kusumi 		mask = bmap->bitmapq[i];
1422*a63188c8STomohiro Kusumi 
1423*a63188c8STomohiro Kusumi 		radix_mask = 1U << HAMMER2_FREEMAP_BLOCK_RADIX;
1424*a63188c8STomohiro Kusumi 		radix_mask |= radix_mask - 1;
1425*a63188c8STomohiro Kusumi 		iter = 2;	/* each bitmap entry is 2 bits. 2, 4, 8... */
1426*a63188c8STomohiro Kusumi 		while (iter <= HAMMER2_BMAP_BITS_PER_ELEMENT) {
1427*a63188c8STomohiro Kusumi 			if (iter == HAMMER2_BMAP_BITS_PER_ELEMENT)
1428*a63188c8STomohiro Kusumi 				scan = -1;
1429*a63188c8STomohiro Kusumi 			else
1430*a63188c8STomohiro Kusumi 				scan = (1LU << iter) - 1;
1431*a63188c8STomohiro Kusumi 			j = 0;
1432*a63188c8STomohiro Kusumi 			while (j < HAMMER2_BMAP_BITS_PER_ELEMENT) {
1433*a63188c8STomohiro Kusumi 				/*
1434*a63188c8STomohiro Kusumi 				 * Check if all bits are 0 (free block).
1435*a63188c8STomohiro Kusumi 				 * If so, set the bit in bigmask for the
1436*a63188c8STomohiro Kusumi 				 * allocation radix under test.
1437*a63188c8STomohiro Kusumi 				 */
1438*a63188c8STomohiro Kusumi 				if ((scan & mask) == 0) {
1439*a63188c8STomohiro Kusumi 					bigmask |= radix_mask;
1440*a63188c8STomohiro Kusumi 				}
1441*a63188c8STomohiro Kusumi 				scan <<= iter;
1442*a63188c8STomohiro Kusumi 				j += iter;
1443*a63188c8STomohiro Kusumi 			}
1444*a63188c8STomohiro Kusumi 			iter <<= 1;
1445*a63188c8STomohiro Kusumi 			radix_mask = (radix_mask << 1) | 1;
1446*a63188c8STomohiro Kusumi 		}
1447*a63188c8STomohiro Kusumi 	}
1448*a63188c8STomohiro Kusumi 	return bigmask;
1449*a63188c8STomohiro Kusumi }
1450*a63188c8STomohiro Kusumi 
1451*a63188c8STomohiro Kusumi static int
1452*a63188c8STomohiro Kusumi bigmask_good(hammer2_bmap_data_t *bmap, uint32_t live_bigmask)
1453*a63188c8STomohiro Kusumi {
1454*a63188c8STomohiro Kusumi 	uint32_t bigmask;
1455*a63188c8STomohiro Kusumi 
1456*a63188c8STomohiro Kusumi 	bigmask = bigmask_get(bmap);
1457*a63188c8STomohiro Kusumi 	return ((live_bigmask & bigmask) == bigmask);
1458*a63188c8STomohiro Kusumi }