xref: /dragonfly/sys/vfs/hammer/hammer_dedup.c (revision 52f9f0d9)
/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Ilya Dryomov <idryomov@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

static __inline int validate_zone(hammer_off_t data_offset);

int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;
	int seq;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		kprintf("hammer: Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1: an error return sends the candidate to the pass2 list.
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2: an error return sends the candidate to the pass2 list.
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation.  We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
			    validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If the zones don't match, or the data_len fields aren't the
	 * same, we consider it a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock.  An error
	 * return sends the candidate to the pass2 list.
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/*
			 * Return with error = 0, but set an UNDERFLOW flag.
			 */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
			goto downgrade_cursors;
		} else {
			/*
			 * Return the error; the block goes to the pass2
			 * list.
			 */
			goto downgrade_cursors;
		}
	}

	/*
	 * Cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);

	/*
	 * Avoid deadlocking the buffer cache: if the flusher has built
	 * up too much dirty meta-data or the UNDO FIFO is running low,
	 * wait for flush cycles to complete before returning.
	 */
	seq = trans->hmp->flusher.done;
	while (hammer_flusher_meta_halflimit(trans->hmp) ||
	       hammer_flusher_undo_exhausted(trans, 2)) {
		hammer_flusher_wait(trans->hmp, seq);
		seq = hammer_flusher_async_one(trans->hmp);
	}
	return (error);
}
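
/*
 * Illustrative userland sketch (hedged, not part of this file): the
 * batched dedup pass in hammer(8) drives the ioctl handler above,
 * roughly as follows.  elm1/elm2 are the B-Tree base elements of the
 * two candidate records; error handling is trimmed and the ioctl name
 * is taken from hammer_ioctl.h:
 *
 *	struct hammer_ioc_dedup dedup;
 *
 *	bzero(&dedup, sizeof(dedup));
 *	dedup.elm1 = elm1;	(record whose data block is kept)
 *	dedup.elm2 = elm2;	(record re-pointed at elm1's data)
 *	if (ioctl(fd, HAMMERIOC_DEDUP, &dedup) < 0)
 *		(an error sends the candidate to the pass2 list)
 *	else if (dedup.head.flags & (HAMMER_IOC_DEDUP_CMP_FAILURE |
 *				     HAMMER_IOC_DEDUP_INVALID_ZONE))
 *		(not a dedupable pair, skip it)
 */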

static __inline int
validate_zone(hammer_off_t data_offset)
{
	switch(data_offset & HAMMER_OFF_ZONE_MASK) {
	case HAMMER_ZONE_LARGE_DATA:
	case HAMMER_ZONE_SMALL_DATA:
		return (0);
	default:
		return (1);
	}
}
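
/*
 * Note: the zone is encoded in the high bits of a hammer_off_t, so the
 * single mask test above suffices; it is equivalent to comparing
 * HAMMER_ZONE_DECODE(data_offset) (as hammer_dedup_validate() does
 * below) against the two data zones.
 */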

/************************************************************************
 *				LIVE DEDUP				*
 ************************************************************************
 *
 * HAMMER Live Dedup (aka an efficient cp(1) implementation)
 *
 * The dedup cache is operated in an LRU fashion and destroyed on
 * unmount, so essentially this is a live dedup on a cached dataset and
 * not a full-fledged fs-wide one - we have a batched dedup for that.
 * We allow duplicate entries in the buffer cache; data blocks are
 * deduplicated only on their way to media.  By default the cache is
 * populated on reads only, but it can be populated on writes too.
 *
 * The main implementation gotcha is the on-media requirement - in order
 * for a data block to be added to the dedup cache it has to be present
 * on disk.  This simplifies the cache logic a lot - once data is laid
 * out on media it remains valid on media all the way up to the point
 * where the related big block the data was stored in is freed - so
 * there is only one place we need to bother with invalidation code.
 */
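
/*
 * Putting the pieces below together, the intended flow is roughly the
 * following sketch (illustrative only - the real call sites live in the
 * backend write and big-block free paths):
 *
 *	hammer_dedup_cache_t dcp;
 *
 *	dcp = hammer_dedup_cache_lookup(hmp, leaf->data_crc);
 *	if (dcp != NULL && hammer_dedup_validate(dcp, zone, bytes, data)) {
 *		(reuse dcp->data_offset instead of allocating fresh
 *		 media space for an identical data block)
 *	}
 *
 * and when a big block is freed every cached entry pointing into it is
 * dropped via hammer_dedup_cache_inval(hmp, base_offset).
 */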

/*
 * RB-Tree support for dedup cache structures
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
		hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
		hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

struct hammer_dedup_inval {
	struct hammer_mount *hmp;
	hammer_off_t base_offset;
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
		void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
		int *errorp);

int
hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
				hammer_dedup_cache_t dc2)
{
	if (dc1->crc < dc2->crc)
		return (-1);
	if (dc1->crc > dc2->crc)
		return (1);

	return (0);
}

int
hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
				hammer_dedup_cache_t dc2)
{
	if (dc1->data_offset < dc2->data_offset)
		return (-1);
	if (dc1->data_offset > dc2->data_offset)
		return (1);

	return (0);
}

static int
hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
{
	hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;

	if (dc->data_offset < off)
		return (-1);
	if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
		return (1);

	return (0);
}

static int
hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
{
	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;

	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);

	--hmp->dedup_cache_count;
	kfree(dc, hmp->m_misc);

	return (0);
}

hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Pre-allocate a spare entry.  kmalloc() can block, so re-check
	 * the free slot after it returns and discard the allocation if
	 * somebody else filled the slot in the meantime.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/*
	 * If an entry with the same CRC already exists, re-target it to
	 * the new leaf: pull it off the offset tree and the LRU list and
	 * repopulate it below.  Its CRC tree linkage is kept.
	 */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/*
	 * Otherwise consume the pre-allocated entry while the cache is
	 * below its size limit, or recycle the least recently used one.
	 */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}
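
/*
 * Population sketch (hedged): with the live-dedup tunable enabled the
 * read path - and optionally the write path - feeds resolved B-Tree
 * leaves into the cache, along the lines of:
 *
 *	if (hammer_live_dedup)
 *		hammer_dedup_cache_add(ip, cursor->leaf);
 *
 * (hammer_live_dedup is assumed here; only hammer_live_dedup_cache_size
 * is referenced in this file.)
 */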

__inline hammer_dedup_cache_t
hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
{
	hammer_dedup_cache_t dcp;

	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, crc);
	return (dcp);
}

void
hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
{
	struct hammer_dedup_inval di;

	di.hmp = hmp;
	di.base_offset = base_offset;

	RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
		hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
}
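
/*
 * Expected caller sketch (hedged; big_block_base is a placeholder for
 * the base offset of the big block being freed): the invalidation hook
 * belongs in the big-block freeing path so that cached entries never
 * outlive the media they point at:
 *
 *	hammer_dedup_cache_inval(hmp, big_block_base);
 */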

void
hammer_destroy_dedup_cache(hammer_mount_t hmp)
{
	hammer_dedup_cache_t dcp;

	while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
		RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		--hmp->dedup_cache_count;
		kfree(dcp, hmp->m_misc);
	}

	KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
	KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
	KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));

	KKASSERT(hmp->dedup_cache_count == 0);
}

int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
		      void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: a vnode-based buffer or a
	 * block-device-based buffer.  If the vnode-based buffer isn't
	 * there, or a non-blocking attempt to acquire it fails, fall
	 * back to the device-based buffer (for large-zone data blocks
	 * this generates a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 *     is turned on we must always use the device buffer.
	 */
#if 0
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * Fall back to the device buffer if there was an error above or
	 * if double buffering is enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}
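/*
 * *errorp legend for _vnode_validate() below, derived from the
 * assignments in its body:
 *
 *	1 - inode lookup failed        5 - buffer not B_CACHE, or dirty
 *	2 - could not acquire vnode    6 - VOP_BMAP() failed
 *	3 - findblk() failed           7 - bmap offset mismatch
 *	4 - reserved (buffer mapped to userspace, see the XXX)
 */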
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
	    dcp->localization, 0, &error);
	if (ip == NULL) {
		kprintf("dedup: unable to find objid %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
			    NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}

static __inline int
_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	void *ondisk_data;
	int result, error;

	result = error = 0;
	*errorp = 0;

	ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
	    dcp->bytes, &error, &buffer);
	if (error) {
		*errorp = 1;
		goto failed;
	}

	if (bcmp(data, ondisk_data, dcp->bytes) == 0)
		result = 1;

failed:
	if (buffer)
		hammer_rel_buffer(buffer, 0);

	return (result);
}
557