xref: /dragonfly/sys/vfs/hammer/hammer_dedup.c (revision a4fe36f1)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 
37 static __inline int validate_zone(hammer_off_t data_offset);
38 
/*
 * Backend for the HAMMER_IOC_DEDUP ioctl (batched, userspace-driven dedup).
 *
 * Userspace hands us two B-Tree element keys (elm1/elm2) whose data CRCs
 * matched.  We look both elements up, verify their data blocks really are
 * identical, then make elm2 reference elm1's data block and free elm2's
 * original block.
 *
 * Returns 0 on success or a "retry later" condition (the specific
 * condition is reported via dedup->head.flags: INVALID_ZONE, CMP_FAILURE
 * or UNDERFLOW).  Any non-zero error tells the caller to move the
 * candidate to its pass2 list.
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;
	int seq;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		hkprintf("Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	/* Map in the referenced data so we can bcmp() it below */
	error = hammer_btree_extract_data(&cursor1);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract_data(&cursor2);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation. We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
			    validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If zones don't match or data_len fields aren't the same
	 * we consider it to be a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if (HAMMER_ZONE(cursor1.leaf->data_offset) !=
	    HAMMER_ZONE(cursor2.leaf->data_offset)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	/*
	 * Bump the reference count on cursor1's data block; from here on
	 * two B-Tree elements will point at it.
	 */
	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/* Return with error = 0, but set an UNDERFLOW flag */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
		}

		/* Return all other errors -> block goes to pass2 list */
		goto downgrade_cursors;
	}

	/*
	 * The cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	/*
	 * The actual dedup: repoint cursor2's leaf at cursor1's (now
	 * shared) data block, under hammer_modify_node() so the change
	 * is UNDO-protected.
	 */
	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);

	/*
	 * Avoid deadlocking the buffer cache: throttle by waiting on the
	 * flusher while meta-data buffers are half-full or the UNDO FIFO
	 * is close to exhaustion.
	 */
	seq = trans->hmp->flusher.done;
	while (hammer_flusher_meta_halflimit(trans->hmp) ||
	       hammer_flusher_undo_exhausted(trans, 2)) {
		hammer_flusher_wait(trans->hmp, seq);
		seq = hammer_flusher_async_one(trans->hmp);
	}
	return (error);
}
184 
185 static __inline int
186 validate_zone(hammer_off_t data_offset)
187 {
188 	switch(HAMMER_ZONE(data_offset)) {
189 	case HAMMER_ZONE_LARGE_DATA:
190 	case HAMMER_ZONE_SMALL_DATA:
191 		return (0);
192 	default:
193 		return (1);
194 	}
195 }
196 
197 /************************************************************************
198  *				LIVE DEDUP 				*
199  ************************************************************************
200  *
201  * HAMMER Live Dedup (aka as efficient cp(1) implementation)
202  *
203  * The dedup cache is operated in a LRU fashion and destroyed on
204  * unmount, so essentially this is a live dedup on a cached dataset and
205  * not a full-fledged fs-wide one - we have a batched dedup for that.
206  * We allow duplicate entries in the buffer cache, data blocks are
207  * deduplicated only on their way to media.  By default the cache is
208  * populated on reads only, but it can be populated on writes too.
209  *
210  * The main implementation gotcha is on-media requirement - in order for
211  * a data block to be added to a dedup cache it has to be present on
212  * disk.  This simplifies cache logic a lot - once data is laid out on
213  * media it remains valid on media all the way up to the point where the
214  * related big-block the data was stored in is freed - so there is only
215  * one place we need to bother with invalidation code.
216  */
217 
218 /*
219  * RB-Tree support for dedup cache structures
220  */
/*
 * Two indexes over the same cache entries: by data CRC (the lookup key
 * used on the write path) and by media offset (used to invalidate all
 * entries backed by a freed big-block).
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
		hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
		hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

/*
 * Argument bundle passed through RB_SCAN() when invalidating every
 * cache entry that falls inside a freed big-block.
 */
struct hammer_dedup_inval {
	struct hammer_mount *hmp;	/* mount owning the dedup cache */
	hammer_off_t base_offset;	/* base offset of the freed big-block */
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
		void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
238 
239 int
240 hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
241 {
242 	if (dc1->crc < dc2->crc)
243 		return (-1);
244 	if (dc1->crc > dc2->crc)
245 		return (1);
246 
247 	return (0);
248 }
249 
250 int
251 hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
252 {
253 	if (dc1->data_offset < dc2->data_offset)
254 		return (-1);
255 	if (dc1->data_offset > dc2->data_offset)
256 		return (1);
257 
258 	return (0);
259 }
260 
261 static int
262 hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
263 {
264 	hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
265 
266 	if (dc->data_offset < off)
267 		return (-1);
268 	if (dc->data_offset >= off + HAMMER_BIGBLOCK_SIZE)
269 		return (1);
270 
271 	return (0);
272 }
273 
274 static int
275 hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
276 {
277 	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;
278 
279 	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
280 	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
281 	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
282 
283 	--hmp->dedup_cache_count;
284 	kfree(dc, hmp->m_misc);
285 
286 	return (0);
287 }
288 
/*
 * Add (or refresh) a dedup cache entry describing the on-media data
 * block referenced by @leaf, owned by inode @ip.  Returns the entry,
 * which is also inserted into both RB indexes and placed at the tail
 * (most-recently-used end) of the LRU list.
 */
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Pre-stock the single-entry free cache.  kmalloc(M_WAITOK) may
	 * block, so re-check the slot afterwards and discard our
	 * allocation if someone else filled it while we slept.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/*
	 * CRC already cached: reuse the existing entry.  Pull it out of
	 * the offset index and LRU (it is re-inserted under its new
	 * offset below); it stays in the CRC tree since the key is
	 * unchanged.
	 */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/*
	 * New CRC: take the pre-stocked free entry while under the cache
	 * size limit, otherwise recycle the least-recently-used entry
	 * (head of the LRU list).
	 */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	/* leaf->base.key is the file offset just past the data */
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	/* Re-insert under the (possibly new) offset and mark as MRU */
	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}
345 
346 __inline hammer_dedup_cache_t
347 hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
348 {
349 	hammer_dedup_cache_t dcp;
350 
351 	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
352 				&hmp->rb_dedup_crc_root, crc);
353 	return dcp;
354 }
355 
356 void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
357 {
358 	struct hammer_dedup_inval di;
359 
360 	di.hmp = hmp;
361 	di.base_offset = base_offset;
362 
363 	RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
364 		hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
365 }
366 
367 void
368 hammer_destroy_dedup_cache(hammer_mount_t hmp)
369 {
370 	hammer_dedup_cache_t dcp;
371 
372 	while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
373 		RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
374 		RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
375 		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
376 		--hmp->dedup_cache_count;
377 		kfree(dcp, hmp->m_misc);
378 	}
379 
380 	KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
381 	KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
382 	KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
383 
384 	KKASSERT(hmp->dedup_cache_count == 0);
385 }
386 
/*
 * Decide whether the cache entry @dcp really describes the candidate
 * data block (@zone, @bytes, @data) about to be written.  Returns 1
 * when the on-media data matches byte-for-byte (the write may be
 * deduplicated against dcp->data_offset), 0 otherwise.
 */
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: vnode-based buffer or
	 * block-device-based buffer.  In case vnode-based buffer wasn't
	 * there or if a non-blocking attempt to acquire it failed use
	 * device-based buffer (for large-zone data blocks it will
	 * generate a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 *     is turned on we must always use the device buffer.
	 */
#if 0
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * If there was an error previously or if double buffering is
	 * enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	/* Device-buffer comparison only; see the XXX note above */
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}
446 
/*
 * Compare @data against the vnode-based buffer-cache copy of the block
 * described by @dcp.  Returns 1 on a byte-for-byte match, 0 otherwise.
 *
 * *errorp is 0 on a clean comparison; otherwise it identifies the
 * failure point (1 = inode lookup, 2 = vnode acquisition, 3 = findblk
 * missed, 5 = buffer not usable, 6/7 = BMAP disagreement).  The caller
 * counts error 3 as a findblk failure.
 *
 * Currently only referenced from #if 0'd code - not MP safe.
 */
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	/* Re-resolve the inode the cache entry was created from */
	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
	    dcp->localization, 0, &error);
	if (ip == NULL) {
		hkprintf("dedup: unable to find objid %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		hkprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	/* Non-blocking lookup: never wait for a busy buffer here */
	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		/* Must contain valid, unmodified data to be comparable */
		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		/*
		 * The buffer must be backed by the same media block the
		 * cache entry refers to.  If the cached translation
		 * doesn't match, fall back to VOP_BMAP to confirm.
		 */
		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
			    NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}
520 
521 static __inline int
522 _dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
523 {
524 	hammer_buffer_t buffer = NULL;
525 	void *ondisk_data;
526 	int result, error;
527 
528 	result = error = 0;
529 	*errorp = 0;
530 
531 	ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
532 	    dcp->bytes, &error, &buffer);
533 	if (error) {
534 		*errorp = 1;
535 		goto failed;
536 	}
537 
538 	if (bcmp(data, ondisk_data, dcp->bytes) == 0)
539 		result = 1;
540 
541 failed:
542 	if (buffer)
543 		hammer_rel_buffer(buffer, 0);
544 
545 	return (result);
546 }
547