xref: /dragonfly/sys/vfs/hammer/hammer_dedup.c (revision 10f4bf95)
/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Ilya Dryomov <idryomov@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

static __inline int validate_zone(hammer_off_t data_offset);

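/*
 * hammer_ioc_dedup() - backend for the HAMMER_IOC_DEDUP ioctl.
 *
 * Given two B-Tree leaf elements (dedup->elm1 and dedup->elm2) that the
 * caller (normally the hammer(8) dedup utility) believes reference
 * identical data, verify the claim and, if it holds, make elm2 share
 * elm1's underlying data block.  Soft failures are reported through
 * dedup->head.flags with error = 0; hard errors cause the caller to
 * queue the candidate for a second pass.
 */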
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		kprintf("hammer: Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation.  We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
			    validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If the zones don't match or the data_len fields aren't the
	 * same we consider it a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

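	/*
	 * Add a storage reference to elm1's data block.  This is done by
	 * adjusting the big-block accounting; ERANGE reports that the
	 * accounting would underflow and is handled softly below.
	 */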
	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/*
			 * Return with error = 0, but set an UNDERFLOW flag
			 */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
			goto downgrade_cursors;
		} else {
			/*
			 * Return an error -> block goes to pass2 list
			 */
			goto downgrade_cursors;
		}
	}

	/*
	 * Cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

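	/*
	 * Repoint elm2's leaf at elm1's data.  From here on both leaf
	 * elements reference the same on-media data block.
	 */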
	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);
	return (error);
}

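/*
 * Only blocks residing in the two data zones can be deduplicated;
 * everything else (B-Tree nodes, meta-data) must be left alone.
 * Returns 0 if the zone is acceptable, 1 otherwise.
 */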
static __inline int
validate_zone(hammer_off_t data_offset)
{
	switch(data_offset & HAMMER_OFF_ZONE_MASK) {
	case HAMMER_ZONE_LARGE_DATA:
	case HAMMER_ZONE_SMALL_DATA:
		return (0);
	default:
		return (1);
	}
}

/************************************************************************
 *				LIVE DEDUP				*
 ************************************************************************
 *
 * HAMMER Live Dedup (aka an efficient cp(1) implementation)
 *
 * The dedup cache is operated in an LRU fashion and destroyed on
 * unmount, so essentially this is a live dedup on a cached dataset and
 * not a full-fledged fs-wide one - we have a batched dedup for that.
 * We allow duplicate entries in the buffer cache, data blocks are
 * deduplicated only on their way to media.  By default the cache is
 * populated on reads only, but it can be populated on writes too.
 *
 * The main implementation gotcha is the on-media requirement - in order
 * for a data block to be added to the dedup cache it has to be present
 * on disk.  This simplifies the cache logic a lot - once data is laid
 * out on media it remains valid on media all the way up to the point
 * where the big block it was stored in is freed - so there is only one
 * place we need to bother with invalidation code.
 */

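/*
 * Illustrative sketch (not the actual call site): roughly how a backend
 * write path is expected to consult the cache before allocating fresh
 * storage for a record.  'ip', 'leaf', 'zone', 'bytes' and 'data' are
 * stand-ins for the record being flushed.
 *
 *	hammer_dedup_cache_t dcp;
 *
 *	dcp = hammer_dedup_cache_lookup(ip->hmp, leaf->data_crc);
 *	if (dcp && hammer_dedup_validate(dcp, zone, bytes, data)) {
 *		... reuse dcp->data_offset instead of allocating ...
 *	} else {
 *		... allocate and write new storage, then ...
 *		hammer_dedup_cache_add(ip, leaf);
 *	}
 */
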
/*
 * RB-Tree support for dedup cache structures
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
		hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
		hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

struct hammer_dedup_inval {
	struct hammer_mount *hmp;
	hammer_off_t base_offset;
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
		void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);

int
hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
				hammer_dedup_cache_t dc2)
{
	if (dc1->crc < dc2->crc)
		return (-1);
	if (dc1->crc > dc2->crc)
		return (1);

	return (0);
}

int
hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
				hammer_dedup_cache_t dc2)
{
	if (dc1->data_offset < dc2->data_offset)
		return (-1);
	if (dc1->data_offset > dc2->data_offset)
		return (1);

	return (0);
}

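/*
 * Scan compare for invalidation: match every cache entry whose
 * data_offset falls inside the big block starting at base_offset.
 */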
static int
hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
{
	hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;

	if (dc->data_offset < off)
		return (-1);
	if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
		return (1);

	return (0);
}

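/*
 * Invalidation callback: unlink the matched entry from both RB trees
 * and the LRU list, then free it.
 */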
static int
hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
{
	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;

	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);

	--hmp->dedup_cache_count;
	kfree(dc, hmp->m_misc);

	return (0);
}

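/*
 * Add (or refresh) a dedup cache entry describing the on-media data
 * referenced by 'leaf'.  If an entry with the same CRC already exists
 * it is repopulated in place; otherwise a preallocated entry is used
 * or, once the cache is full, the coldest entry is recycled in LRU
 * order.  Returns the populated entry.
 */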
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Keep one entry preallocated.  kmalloc() can block, so recheck
	 * the free slot afterwards and discard the allocation if we
	 * raced against somebody else refilling it.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/*
	 * If an entry with this CRC already exists just repopulate it,
	 * temporarily unlinking it from the offset tree and the LRU.
	 */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/*
	 * Use the preallocated entry while the cache is below its size
	 * limit, otherwise recycle the least-recently-used entry.
	 */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}

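/*
 * Lookup a dedup cache entry by data CRC.  Returns NULL if no entry
 * exists for that CRC.
 */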
__inline hammer_dedup_cache_t
hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
{
	hammer_dedup_cache_t dcp;

	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, crc);
	return (dcp);
}

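/*
 * Invalidate all cache entries backed by the big block beginning at
 * base_offset.  Called when that big block is freed - the only point
 * at which cached on-media data can go stale.
 */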
void
hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
{
	struct hammer_dedup_inval di;

	di.hmp = hmp;
	di.base_offset = base_offset;

	RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
		hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
}

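/*
 * Tear down the entire dedup cache.  Called on unmount.
 */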
void
hammer_destroy_dedup_cache(hammer_mount_t hmp)
{
	hammer_dedup_cache_t dcp;

	while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
		RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		--hmp->dedup_cache_count;
		kfree(dcp, hmp->m_misc);
	}

	KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
	KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
	KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));

	KKASSERT(hmp->dedup_cache_count == 0);
}

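/*
 * Validate that the on-media data described by a dedup cache entry
 * still matches 'data' (zone, length and actual bytes).  Returns
 * non-zero if the cached block may be reused, 0 if the candidate must
 * be rejected.
 */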
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
		      void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: a vnode-based buffer or a
	 * block-device-based buffer.  If the vnode-based buffer isn't
	 * there, or a non-blocking attempt to acquire it fails, fall
	 * back to the device-based buffer (for large-zone data blocks
	 * this generates a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 *     is turned on we must always use the device buffer.
	 */
#if 0
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * If there was an error previously or if double buffering is
	 * enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}

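/*
 * Try to validate against a vnode-based buffer already present in the
 * buffer cache (currently unused; see the XXX note above).  On soft
 * failure *errorp encodes the reason: 1 inode lookup failed, 2 vnode
 * acquisition failed, 3 findblk() failed, 5 buffer not cached or dirty,
 * 6 VOP_BMAP() failed, 7 translated offset mismatch; 4 is reserved for
 * the userspace-mapping check sketched in the XXX below.
 */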
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
	    dcp->localization, 0, &error);
	if (ip == NULL) {
		kprintf("dedup: unable to find objid %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		if ((bp->b_flags & B_CACHE) == 0 || (bp->b_flags & B_DIRTY)) {
			*errorp = 5;
			goto done;
		}

		/*
		 * If the buffer's translation isn't cached yet, BMAP it
		 * and verify that it really maps to our media offset.
		 */
		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
			    NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}

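/*
 * Fallback validation: read the block through the block device buffer
 * layer and compare it against 'data'.  *errorp is set to 1 if the
 * media read fails.
 */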
static __inline int
_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	void *ondisk_data;
	int result, error;

	result = error = 0;
	*errorp = 0;

	ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
	    dcp->bytes, &error, &buffer);
	if (error) {
		*errorp = 1;
		goto failed;
	}

	if (bcmp(data, ondisk_data, dcp->bytes) == 0)
		result = 1;

failed:
	if (buffer)
		hammer_rel_buffer(buffer, 0);

	return (result);
}