/*
 * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include "hammer2.h"

/*
 * breadth-first search
 */
typedef struct hammer2_chain_save {
	TAILQ_ENTRY(hammer2_chain_save)	entry;
	hammer2_chain_t	*chain;
	int pri;
} hammer2_chain_save_t;

TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save);
typedef struct hammer2_chain_save_list hammer2_chain_save_list_t;

typedef struct hammer2_bulkfree_info {
	hammer2_dev_t		*hmp;
	kmem_anon_desc_t	kp;
	hammer2_off_t		sbase;		/* sub-loop iteration */
	hammer2_off_t		sstop;
	hammer2_bmap_data_t	*bmap;
	int			depth;
	long			count_10_00;	/* staged->free	     */
	long			count_11_10;	/* allocated->staged */
	long			count_00_11;	/* (should not happen) */
	long			count_01_11;	/* (should not happen) */
	long			count_10_11;	/* staged->allocated */
	long			count_l0cleans;
	long			count_linadjusts;
	long			count_inodes_scanned;
	long			count_dedup_factor;
	long			bytes_scanned;
	hammer2_off_t		adj_free;
	hammer2_tid_t		mtid;
	hammer2_tid_t		saved_mirror_tid;
	time_t			save_time;
	hammer2_chain_save_list_t list;
	hammer2_dedup_t		*dedup;
	int			pri;
} hammer2_bulkfree_info_t;

static int h2_bulkfree_test(hammer2_bulkfree_info_t *info,
			hammer2_blockref_t *bref, int pri);

/*
 * General bulk scan function with callback.  Called with a referenced
 * but UNLOCKED parent.  The parent is returned in the same state.
 */
static
int
hammer2_bulk_scan(hammer2_chain_t *parent,
		  int (*func)(hammer2_bulkfree_info_t *info,
			      hammer2_blockref_t *bref),
		  hammer2_bulkfree_info_t *info)
{
	hammer2_blockref_t bref;
	hammer2_chain_t *chain;
	int cache_index = -1;
	int doabort = 0;
	int first = 1;

	++info->pri;

	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
				   HAMMER2_RESOLVE_SHARED);
	chain = NULL;

	/*
	 * Generally loop on the contents if we have not been flagged
	 * for abort.
	 *
	 * Remember that these chains are completely isolated from
	 * the frontend, so we can release locks temporarily without
	 * imploding.
	 */
	while ((doabort & HAMMER2_BULK_ABORT) == 0 &&
	       hammer2_chain_scan(parent, &chain, &bref, &first,
				  &cache_index,
				  HAMMER2_LOOKUP_NODATA |
				  HAMMER2_LOOKUP_SHARED) != NULL) {
		/*
		 * Process the bref.  chain is only non-NULL if the bref
		 * might be recursable (it's possible to sometimes get a
		 * non-NULL chain where the bref cannot be recursed).
		 */
#if 0
		kprintf("SCAN %016jx\n", bref.data_off);
		int xerr = tsleep(&info->pri, PCATCH, "slp", hz / 10);
		if (xerr == EINTR || xerr == ERESTART) {
			doabort |= HAMMER2_BULK_ABORT;
		}
#endif
		++info->pri;
		if (h2_bulkfree_test(info, &bref, 1))
			continue;

		doabort |= func(info, &bref);

		if (doabort & HAMMER2_BULK_ABORT)
			break;

		/*
		 * A non-NULL chain is always returned when the bref is
		 * recursive.  When the bref is not recursive a non-NULL
		 * chain may still be returned, but usually is not.
		 */
		if (chain == NULL)
			continue;

		/*
		 * Else check the type and set up the depth-first scan.
		 *
		 * Account for bytes actually read.
		 */
		info->bytes_scanned += chain->bytes;

		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			++info->depth;
			if (info->depth > 16) {
				hammer2_chain_save_t *save;
				save = kmalloc(sizeof(*save), M_HAMMER2,
					       M_WAITOK | M_ZERO);
				save->chain = chain;
				hammer2_chain_ref(chain);
				TAILQ_INSERT_TAIL(&info->list, save, entry);

				/* guess */
				info->pri += 10;
			} else {
				int savepri = info->pri;

				hammer2_chain_unlock(chain);
				info->pri = 0;
				doabort |= hammer2_bulk_scan(chain, func, info);
				info->pri += savepri;
				hammer2_chain_lock(chain,
						   HAMMER2_RESOLVE_ALWAYS |
						   HAMMER2_RESOLVE_SHARED);
			}
			--info->depth;
			break;
		default:
			/* does not recurse */
			break;
		}
	}
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}

	/*
	 * Save with higher pri now that we know what it is.
	 */
	h2_bulkfree_test(info, &parent->bref, info->pri + 1);

	hammer2_chain_unlock(parent);

	return doabort;
}

/*
 * Bulkfree algorithm
 *
 * Repeat {
 *	Chain flush (partial synchronization) XXX removed
 *	Scan the whole topology - build in-memory freemap (mark 11)
 *	Reconcile the in-memory freemap against the on-disk freemap.
 *		ondisk xx -> ondisk 11 (if allocated)
 *		ondisk 11 -> ondisk 10 (if free in-memory)
 *		ondisk 10 -> ondisk 00 (if free in-memory) - on next pass
 * }
 *
 * The topology scan may have to be performed multiple times to window
 * freemaps which are too large to fit in kernel memory.
 *
 * Races are handled using a double-transition (11->10, 10->00).  The bulkfree
 * scan snapshots the volume root's blockset and thus can run concurrently
 * with normal operations, as long as a full flush is made between each pass
 * to synchronize any modified chains (otherwise their blocks might be
 * improperly freed).
 *
 * Temporary memory in multiples of 64KB is required to reconstruct the leaf
 * hammer2_bmap_data blocks so they can later be compared against the live
 * freemap.  Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage.
 * A 32MB save area thus represents around ~1 TB.  The amount of temporary
 * memory allocated can be specified.  If it is not sufficient, multiple
 * topology passes will be made.
 */

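/*
 * Illustrative sketch (not compiled): how the temporary-memory size maps
 * to the amount of storage covered per topology pass.  This mirrors the
 * (incr) computation in hammer2_bulkfree_pass() below; the helper name is
 * hypothetical.
 */
#if 0
static hammer2_off_t
h2_bulkfree_window_bytes(size_t tmp_bytes)
{
	/*
	 * Round the caller-supplied temporary memory up to a multiple of
	 * one freemap leaf (HAMMER2_FREEMAP_LEVELN_PSIZE), then scale:
	 * each leaf's worth of temporary memory lets one pass cover one
	 * HAMMER2_FREEMAP_LEVEL1_SIZE range of media.
	 */
	tmp_bytes = (tmp_bytes + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) &
		    ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1);
	return ((hammer2_off_t)(tmp_bytes / HAMMER2_FREEMAP_LEVELN_PSIZE) *
		HAMMER2_FREEMAP_LEVEL1_SIZE);
}
#endif
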
/*
 * Bulkfree callback info
 */
static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo,
			hammer2_blockref_t *bref);
static void h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo);
static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_bmap_data_t *live, hammer2_bmap_data_t *bmap,
			int nofree);

int
hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
{
	hammer2_bulkfree_info_t cbinfo;
	hammer2_chain_t *vchain;
	hammer2_chain_save_t *save;
	hammer2_off_t incr;
	size_t size;
	int doabort = 0;

	/*
	 * A bulkfree operations lock is required for the duration.  We
	 * must hold it across our flushes to guarantee that we never run
	 * two bulkfree passes in a row without a flush in the middle.
	 */
	lockmgr(&hmp->bulklk, LK_EXCLUSIVE);

	/*
	 * We have to clear the live dedup cache as it might have entries
	 * that are freeable as of now.  Any new entries in the dedup cache
	 * made after this point, even if they become freeable, will have
	 * previously been fully allocated and will be protected by the
	 * 2-stage bulkfree.
	 */
	hammer2_dedup_clear(hmp);

#if 0
	/*
	 * XXX This has been removed.  Instead of trying to flush, which
	 * appears to have a ton of races against live chains even with
	 * the two-stage scan, we simply refuse to free any blocks
	 * related to freemap chains modified after the last filesystem
	 * sync.
	 *
	 * Do a quick flush so we can snapshot vchain for any blocks that
	 * have been allocated prior to this point.  We don't need to
	 * flush vnodes, logical buffers, or dirty inodes that have not
	 * allocated blocks yet.  We do not want to flush the device buffers
	 * nor do we want to flush the actual volume root to disk here;
	 * that is not needed to perform the snapshot.
	 */
	hammer2_flush_quick(hmp);
#endif

	/*
	 * Setup for free pass
	 */
	bzero(&cbinfo, sizeof(cbinfo));
	size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) &
	       ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1);
	cbinfo.hmp = hmp;
	cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size);
	cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid;

	cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE,
			       M_HAMMER2, M_WAITOK | M_ZERO);

	/*
	 * Normalize start point to a 2GB boundary.  We operate on a
	 * 64KB leaf bitmap boundary which represents 2GB of storage.
	 */
	cbinfo.sbase = bfi->sbase;
	if (cbinfo.sbase > hmp->voldata.volu_size)
		cbinfo.sbase = hmp->voldata.volu_size;
	cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK;

	/*
	 * The primary storage scan must use a snapshot of the volume
	 * root to avoid racing renames and other frontend work.
	 *
	 * Note that snapshots only snap synchronized storage, so
	 * we have to flush between each pass or we risk freeing
	 * storage allocated by the frontend.
	 */
	vchain = hammer2_chain_bulksnap(&hmp->vchain);
	TAILQ_INIT(&cbinfo.list);

	/*
	 * Loop on a full meta-data scan as many times as required to
	 * get through all available storage.
	 */
	while (cbinfo.sbase < hmp->voldata.volu_size) {
		/*
		 * We have enough ram to represent (incr) bytes of storage.
		 * Each 64KB of ram represents 2GB of storage.
		 */
		bzero(cbinfo.bmap, size);
		incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE *
		       HAMMER2_FREEMAP_LEVEL1_SIZE;
		if (hmp->voldata.volu_size - cbinfo.sbase < incr)
			cbinfo.sstop = hmp->voldata.volu_size;
		else
			cbinfo.sstop = cbinfo.sbase + incr;
		if (hammer2_debug & 1)
			kprintf("bulkfree pass %016jx/%jdGB\n",
				(intmax_t)cbinfo.sbase,
				(intmax_t)incr / HAMMER2_FREEMAP_LEVEL1_SIZE);

		hammer2_trans_init(hmp->spmp, 0);
		cbinfo.mtid = hammer2_trans_sub(hmp->spmp);
		cbinfo.pri = 0;
		doabort |= hammer2_bulk_scan(vchain, h2_bulkfree_callback,
					     &cbinfo);

		while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL &&
		       doabort == 0) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			cbinfo.pri = 0;
			doabort |= hammer2_bulk_scan(save->chain,
						     h2_bulkfree_callback,
						     &cbinfo);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
		}
		while (save) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
			save = TAILQ_FIRST(&cbinfo.list);
		}

		kprintf("bulkfree lastdrop %d %d\n",
			vchain->refs, vchain->core.chain_count);

		/*
		 * If the complete scan succeeded we can synchronize our
		 * in-memory freemap against live storage.  If an abort
		 * occurred we cannot safely synchronize our partially
		 * filled-out in-memory freemap.
		 */
		if (doabort == 0) {
			h2_bulkfree_sync(&cbinfo);

			hammer2_voldata_lock(hmp);
			hammer2_voldata_modify(hmp);
			hmp->voldata.allocator_free += cbinfo.adj_free;
			hammer2_voldata_unlock(hmp);
		}

		/*
		 * Cleanup for next loop.
		 */
		hammer2_trans_done(hmp->spmp);
		if (doabort)
			break;
		cbinfo.sbase = cbinfo.sstop;
	}
	hammer2_chain_bulkdrop(vchain);
	kmem_free_swapbacked(&cbinfo.kp);
	kfree(cbinfo.dedup, M_HAMMER2);
	cbinfo.dedup = NULL;

	bfi->sstop = cbinfo.sbase;

	incr = bfi->sstop / (hmp->voldata.volu_size / 10000);
	if (incr > 10000)
		incr = 10000;

	kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n",
		(int)incr / 100,
		(int)incr % 100);

	kprintf("    transition->free   %ld\n", cbinfo.count_10_00);
	kprintf("    transition->staged %ld\n", cbinfo.count_11_10);
	kprintf("    ERR(00)->allocated %ld\n", cbinfo.count_00_11);
	kprintf("    ERR(01)->allocated %ld\n", cbinfo.count_01_11);
	kprintf("    staged->allocated  %ld\n", cbinfo.count_10_11);
	kprintf("    ~2MB segs cleaned  %ld\n", cbinfo.count_l0cleans);
	kprintf("    linear adjusts     %ld\n", cbinfo.count_linadjusts);
	kprintf("    dedup factor       %ld\n", cbinfo.count_dedup_factor);

	lockmgr(&hmp->bulklk, LK_RELEASE);

	return doabort;
}
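
/*
 * Minimal illustrative sketch (not compiled) of driving the pass above.
 * Only the bfi fields actually consumed here (size, sbase) need to be
 * primed, and bfi.sstop reports where the scan stopped so an aborted pass
 * could be resumed later.  The 32MB figure and the helper itself are
 * hypothetical; this is not the actual ioctl handler.
 */
#if 0
static int
h2_bulkfree_example(hammer2_dev_t *hmp)
{
	hammer2_ioc_bulkfree_t bfi;
	int doabort;

	bzero(&bfi, sizeof(bfi));
	bfi.size = 32 * 1024 * 1024;	/* temporary freemap memory */
	bfi.sbase = 0;			/* start of media */
	doabort = hammer2_bulkfree_pass(hmp, &bfi);
	if (doabort & HAMMER2_BULK_ABORT) {
		/* a later invocation could resume from bfi.sstop */
	}
	return doabort;
}
#endif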

static int
h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
{
	hammer2_bmap_data_t *bmap;
	hammer2_off_t data_off;
	uint16_t class;
	size_t bytes;
	int radix;
	int error;

	/*
	 * Check for signal and allow yield to userland during scan
	 */
	if (hammer2_signal_check(&cbinfo->save_time))
		return HAMMER2_BULK_ABORT;
	if (bref->type == HAMMER2_BREF_TYPE_INODE) {
		++cbinfo->count_inodes_scanned;
		if ((cbinfo->count_inodes_scanned & 1023) == 0)
			kprintf(" inodes %6ld bytes %9ld\n",
				cbinfo->count_inodes_scanned,
				cbinfo->bytes_scanned);
	}

	/*
	 * Calculate the data offset and determine if it is within
	 * the current freemap range being gathered.
	 */
	error = 0;
	data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
	if (data_off < cbinfo->sbase || data_off > cbinfo->sstop)
		return 0;
	if (data_off < cbinfo->hmp->voldata.allocator_beg)
		return 0;
	if (data_off > cbinfo->hmp->voldata.volu_size)
		return 0;

	/*
	 * Calculate the information needed to generate the in-memory
	 * freemap record.
	 *
	 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary
	 * (or the L0 (2MB) boundary, for that matter); it is a problem if
	 * one does.
	 */
	radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	bytes = (size_t)1 << radix;
	class = (bref->type << 8) | hammer2_devblkradix(radix);

	if (data_off + bytes > cbinfo->sstop) {
		kprintf("hammer2_bulkfree_scan: illegal 2GB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = cbinfo->sstop - data_off;	/* XXX */
	}

	/*
	 * Convert to a storage offset relative to the beginning of the
	 * storage range we are collecting.  Then lookup the level0 bmap entry.
	 */
	data_off -= cbinfo->sbase;
	bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX);

	/*
	 * Convert data_off to a bmap-relative value (~2MB storage range).
	 * Adjust linear, class, and avail.
	 *
	 * Hammer2 does not allow allocations to cross the L0 (2MB) boundary.
	 */
	data_off &= HAMMER2_FREEMAP_LEVEL0_MASK;
	if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) {
		kprintf("hammer2_bulkfree_scan: illegal 2MB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off;
	}

	if (bmap->class == 0) {
		bmap->class = class;
		bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
	}
	if (bmap->class != class) {
		kprintf("hammer2_bulkfree_scan: illegal mixed class "
			"%016jx %016jx/%d (%04x vs %04x)\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits,
			class, bmap->class);
	}
	if (bmap->linear < (int32_t)data_off + (int32_t)bytes)
		bmap->linear = (int32_t)data_off + (int32_t)bytes;

	/*
	 * Adjust the hammer2_bitmap_t bitmapq[HAMMER2_BMAP_ELEMENTS] array
	 * of 64-bit entries, 2 bits per freemap block, setting the covered
	 * blocks to code 11 (allocated).
	 *
	 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE.
	 */
	while (bytes > 0) {
		int bindex;
		hammer2_bitmap_t bmask;

		bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX +
					   HAMMER2_BMAP_INDEX_RADIX);
		bmask = (hammer2_bitmap_t)3 <<
			((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >>
			 HAMMER2_FREEMAP_BLOCK_RADIX) << 1);
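
		/*
		 * Worked example (illustrative; assumes 16KB freemap blocks
		 * and 64-bit bitmap elements, i.e. 32 blocks or 512KB per
		 * element): a bmap-relative data_off of 656KB falls in
		 * element 1 (656KB / 512KB), and within that element it is
		 * block 9 (144KB / 16KB), so bmask selects the 2-bit pair
		 * at bit position 18.
		 */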

		/*
		 * NOTE! The (avail) calculation is bitmap-granular.  Multiple
		 *	 sub-granular records can wind up at the same bitmap
		 *	 position.
		 */
		if ((bmap->bitmapq[bindex] & bmask) == 0) {
			if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) {
				bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE;
			} else {
				bmap->avail -= bytes;
			}
			bmap->bitmapq[bindex] |= bmask;
		}
		data_off += HAMMER2_FREEMAP_BLOCK_SIZE;
		if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE)
			bytes = 0;
		else
			bytes -= HAMMER2_FREEMAP_BLOCK_SIZE;
	}
	return error;
}

/*
 * Synchronize the in-memory bitmap with the live freemap.  This is not a
 * direct copy.  Instead the bitmaps must be compared:
 *
 *	In-memory	Live-freemap
 *	   00		  11 -> 10	(do nothing if live modified)
 *			  10 -> 00	(do nothing if live modified)
 *	   11		  10 -> 11	handles race against live
 *			  ** -> 11	nominally warn of corruption
 */
static void
h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
{
	hammer2_off_t data_off;
	hammer2_key_t key;
	hammer2_key_t key_dummy;
	hammer2_bmap_data_t *bmap;
	hammer2_bmap_data_t *live;
	hammer2_chain_t *live_parent;
	hammer2_chain_t *live_chain;
	int cache_index = -1;
	int bmapindex;
	int nofree;

	kprintf("hammer2_bulkfree - range %016jx-%016jx\n",
		(intmax_t)cbinfo->sbase,
		(intmax_t)cbinfo->sstop);

	data_off = cbinfo->sbase;
	bmap = cbinfo->bmap;

	live_parent = &cbinfo->hmp->fchain;
	hammer2_chain_ref(live_parent);
	hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS);
	live_chain = NULL;
	nofree = 1;	/* safety */

	/*
	 * Iterate each hammer2_bmap_data_t line (128 bytes) managing
	 * 4MB of storage.
	 */
	while (data_off < cbinfo->sstop) {
		/*
		 * The freemap is not used below allocator_beg or beyond
		 * volu_size.
		 */
		if (data_off < cbinfo->hmp->voldata.allocator_beg)
			goto next;
		if (data_off > cbinfo->hmp->voldata.volu_size)
			goto next;

		/*
		 * Locate the freemap leaf on the live filesystem
		 */
		key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK);
		if (live_chain == NULL || live_chain->bref.key != key) {
			if (live_chain) {
				hammer2_chain_unlock(live_chain);
				hammer2_chain_drop(live_chain);
			}
			live_chain = hammer2_chain_lookup(
					    &live_parent,
					    &key_dummy,
					    key,
					    key + HAMMER2_FREEMAP_LEVEL1_MASK,
					    &cache_index,
					    HAMMER2_LOOKUP_ALWAYS);
			/*
			 * If recent allocations were made we avoid races by
			 * not staging or freeing any blocks.  We can still
			 * remark blocks as fully allocated.
			 */
			if (live_chain) {
				kprintf("live_chain %016jx\n", (intmax_t)key);
				if (live_chain->bref.mirror_tid >
				    cbinfo->saved_mirror_tid) {
					kprintf("hammer2_bulkfree: "
						"avoid %016jx\n",
						data_off);
					nofree = 1;
				} else {
					nofree = 0;
				}
			}
		}
		if (live_chain == NULL) {
			/*
			 * XXX if we implement a full recovery mode we need
			 * to create/recreate missing freemap chains if our
			 * bmap has any allocated blocks.
			 */
			if (bmap->class &&
			    bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) {
				kprintf("hammer2_bulkfree: cannot locate "
					"live leaf for allocated data "
					"near %016jx\n",
					(intmax_t)data_off);
			}
			goto next;
		}
		if (live_chain->error) {
			kprintf("hammer2_bulkfree: error %s looking up "
				"live leaf for allocated data near %016jx\n",
				hammer2_error_str(live_chain->error),
				(intmax_t)data_off);
			hammer2_chain_unlock(live_chain);
			hammer2_chain_drop(live_chain);
			live_chain = NULL;
			goto next;
		}

		bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >>
			    HAMMER2_FREEMAP_LEVEL0_RADIX;
		live = &live_chain->data->bmdata[bmapindex];

		/*
		 * TODO - we could shortcut this by testing that both
		 * live->class and bmap->class are 0, and both avails are
		 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB).
		 */
		if (bcmp(live->bitmapq, bmap->bitmapq,
			 sizeof(bmap->bitmapq)) == 0) {
			goto next;
		}
		if (hammer2_debug & 1)
			kprintf("live %016jx %04d.%04x (avail=%d)\n",
				data_off, bmapindex, live->class, live->avail);

		hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0);

		h2_bulkfree_sync_adjust(cbinfo, live, bmap, nofree);
next:
		data_off += HAMMER2_FREEMAP_LEVEL0_SIZE;
		++bmap;
	}
	if (live_chain) {
		hammer2_chain_unlock(live_chain);
		hammer2_chain_drop(live_chain);
	}
	if (live_parent) {
		hammer2_chain_unlock(live_parent);
		hammer2_chain_drop(live_parent);
	}
}

/*
 * Merge the bulkfree bitmap against the existing bitmap.
 *
 * If nofree is non-zero the merge will only mark free blocks as allocated
 * and will refuse to free any blocks.
 */
static
void
h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_bmap_data_t *live, hammer2_bmap_data_t *bmap,
			int nofree)
{
	int bindex;
	int scount;
	hammer2_bitmap_t lmask;
	hammer2_bitmap_t mmask;

	for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) {
		lmask = live->bitmapq[bindex];
		mmask = bmap->bitmapq[bindex];
		if (lmask == mmask)
			continue;

		for (scount = 0;
		     scount < HAMMER2_BMAP_BITS_PER_ELEMENT;
		     scount += 2) {
			if ((mmask & 3) == 0) {
				/*
				 * in-memory 00		live 11 -> 10
				 *			live 10 -> 00
				 *
				 * Storage might be marked allocated or
				 * staged and must be remarked staged or
				 * free.
				 */
				switch (lmask & 3) {
				case 0:	/* 00 */
					break;
				case 1:	/* 01 */
					kprintf("hammer2_bulkfree: cannot "
						"transition m=00/l=01\n");
					break;
				case 2:	/* 10 -> 00 */
					if (nofree)
						break;
					live->bitmapq[bindex] &=
					    ~((hammer2_bitmap_t)2 << scount);
					live->avail +=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					if (live->avail >
					    HAMMER2_FREEMAP_LEVEL0_SIZE) {
						live->avail =
						    HAMMER2_FREEMAP_LEVEL0_SIZE;
					}
					cbinfo->adj_free +=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					++cbinfo->count_10_00;
					break;
				case 3:	/* 11 -> 10 */
					if (nofree)
						break;
					live->bitmapq[bindex] &=
					    ~((hammer2_bitmap_t)1 << scount);
					++cbinfo->count_11_10;
					break;
				}
			} else if ((mmask & 3) == 3) {
				/*
				 * in-memory 11		live 10 -> 11
				 *			live ** -> 11
				 *
				 * Storage might be incorrectly marked free
				 * or staged and must be remarked fully
				 * allocated.
				 */
				switch (lmask & 3) {
				case 0:	/* 00 */
					++cbinfo->count_00_11;
					cbinfo->adj_free -=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					live->avail -=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					if ((int32_t)live->avail < 0)
						live->avail = 0;
					break;
				case 1:	/* 01 */
					++cbinfo->count_01_11;
					break;
				case 2:	/* 10 -> 11 */
					++cbinfo->count_10_11;
					break;
				case 3:	/* 11 */
					break;
				}
				live->bitmapq[bindex] |=
					((hammer2_bitmap_t)3 << scount);
			}
			mmask >>= 2;
			lmask >>= 2;
		}
	}

	/*
	 * Determine if the live bitmap is completely free and reset its
	 * fields if so.  Otherwise check to see if we can reduce the linear
	 * offset.
	 */
	for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) {
		if (live->bitmapq[bindex] != 0)
			break;
	}
	if (nofree) {
		/* do nothing */
	} else if (bindex < 0) {
		live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
		live->class = 0;
		live->linear = 0;
		++cbinfo->count_l0cleans;
	} else if (bindex < 7) {
		++bindex;
		if (live->linear > bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
			live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
			++cbinfo->count_linadjusts;
		}

		/*
		 * XXX this fine-grained measure still has some issues.
		 */
		if (live->linear < bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
			live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
			++cbinfo->count_linadjusts;
		}
	} else {
		live->linear = HAMMER2_SEGSIZE;
	}

#if 0
	if (bmap->class) {
		kprintf("%016jx %04d.%04x (avail=%7d) "
			"%08x %08x %08x %08x %08x %08x %08x %08x\n",
			(intmax_t)data_off,
			(int)((data_off &
			       HAMMER2_FREEMAP_LEVEL1_MASK) >>
			      HAMMER2_FREEMAP_LEVEL0_RADIX),
			bmap->class,
			bmap->avail,
			bmap->bitmap[0], bmap->bitmap[1],
			bmap->bitmap[2], bmap->bitmap[3],
			bmap->bitmap[4], bmap->bitmap[5],
			bmap->bitmap[6], bmap->bitmap[7]);
	}
#endif
}
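
/*
 * Minimal illustrative sketch (not compiled) of the per-block 2-bit state
 * merge performed by h2_bulkfree_sync_adjust() above, expressed as a pure
 * function on a single bit-pair.  The helper name is hypothetical; the
 * statistics and avail accounting done by the real code are omitted.
 */
#if 0
static int
h2_bulkfree_merge_pair(int mstate, int lstate, int nofree)
{
	if (mstate == 0) {
		/* in-memory free: demote live 11->10 and 10->00 */
		if (nofree)
			return lstate;
		if (lstate == 3)
			return 2;	/* allocated -> staged */
		if (lstate == 2)
			return 0;	/* staged -> free */
		return lstate;		/* 00 stays, 01 is only reported */
	}
	if (mstate == 3)
		return 3;		/* in-memory allocated: force live 11 */
	return lstate;			/* in-memory map only holds 00 or 11 */
}
#endif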

/*
 * BULKFREE DEDUP HEURISTIC
 *
 * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
 *	    All fields must be loaded into locals and validated.
 */
static
int
h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref,
		 int pri)
{
	hammer2_dedup_t *dedup;
	int best;
	int n;
	int i;

	n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off));
	dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7));
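
	/*
	 * The masked hash above selects an aligned group of 8 dedup slots;
	 * the loop below treats that group as a small 8-way set-associative
	 * cache keyed on data_off, using the ticks field (derived from pri)
	 * as the replacement weight.
	 */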

	for (i = best = 0; i < 8; ++i) {
		if (dedup[i].data_off == bref->data_off) {
			if (dedup[i].ticks < pri)
				dedup[i].ticks = pri;
			if (pri == 1)
				cbinfo->count_dedup_factor += dedup[i].ticks;
			return 1;
		}
		if (dedup[i].ticks < dedup[best].ticks)
			best = i;
	}
	dedup[best].data_off = bref->data_off;
	dedup[best].ticks = pri;

	return 0;
}